1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11 */
12
13 #include <stdlib.h>
14
15 #include "EbInterPrediction.h"
16 #include "convolve.h"
17 #include "common_dsp_rtcd.h"
18 #include "EbUtility.h"
19 //#include "EbRateDistortionCost.h"
20
// Motion-vector bounds, expressed in quarter-pel units (hence the << 2).
// MVBOUNDLOW = (80 - 71) << 2, where 80 is the reference padding and 71 comes
// from the expression -64 + 1 - 8.
#define MVBOUNDLOW \
    36
// MVBOUNDHIGH = (80 + 7) << 2, where 7 comes from the expression -1 + 8.
#define MVBOUNDHIGH 348
// REFPADD_QPEL = (16 + 64) << 2.
#define REFPADD_QPEL 320

#define AOM_INTERP_EXTEND 4

#define SCALE_NUMERATOR 8

// Scaled positions carry SCALE_SUBPEL_BITS fractional bits: the scaled
// convolve code in this file uses `pos >> SCALE_SUBPEL_BITS` as the integer
// sample index and `pos & SCALE_SUBPEL_MASK` as the fractional part.
#define SCALE_SUBPEL_BITS 10
#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
// Extra fractional bits a scaled position has beyond SUBPEL_BITS.
#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
// Half of one SUBPEL step, expressed in the extra-precision bits.
#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)

#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)

#define ROUND0_BITS 3
#define COMPOUND_ROUND1_BITS 7

// File-local wedge mask table with dimensions [BlockSizeS_ALL][2].
// NOTE(review): not referenced in the visible part of this file; the meaning
// of the second index cannot be confirmed from here.
static WedgeMasksType wedge_masks[BlockSizeS_ALL][2];
43
// Returns 1 when `type` is COMPOUND_WEDGE or COMPOUND_DIFFWTD, 0 otherwise.
int is_masked_compound_type(COMPOUND_TYPE type) {
    switch (type) {
    case COMPOUND_WEDGE:
    case COMPOUND_DIFFWTD: return 1;
    default: return 0;
    }
}
47
48
// Computes the residual diff = src - pred for a rows x cols block of 16-bit
// samples.
//
// src8 and pred8 are declared as byte pointers but are reinterpreted as
// uint16_t sample buffers; src_stride and pred_stride are therefore counted in
// 16-bit samples, and diff_stride in int16_t elements. `bd` is accepted for
// signature compatibility and is not used.
void svt_aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride,
                                     const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8,
                                     ptrdiff_t pred_stride, int bd) {
    const uint16_t *const src16  = (const uint16_t *)src8;
    const uint16_t *const pred16 = (const uint16_t *)pred8;
    (void)bd;

    for (int row = 0; row < rows; ++row) {
        int16_t        *diff_row = diff + row * diff_stride;
        const uint16_t *src_row  = src16 + row * src_stride;
        const uint16_t *pred_row = pred16 + row * pred_stride;

        for (int col = 0; col < cols; ++col) {
            diff_row[col] = (int16_t)(src_row[col] - pred_row[col]);
        }
    }
}
64
// Computes the residual diff = src - pred for a rows x cols block of 8-bit
// samples. Each buffer is walked with its own stride (in elements); only the
// first `cols` entries of every diff row are written.
void svt_aom_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride,
                              const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
    for (int row = 0; row < rows; ++row) {
        int16_t       *diff_row = diff + row * diff_stride;
        const uint8_t *src_row  = src + row * src_stride;
        const uint8_t *pred_row = pred + row * pred_stride;

        for (int col = 0; col < cols; ++col) {
            diff_row[col] = (int16_t)(src_row[col] - pred_row[col]);
        }
    }
}
77
diffwtd_mask(uint8_t * mask,int which_inverse,int mask_base,const uint8_t * src0,int src0_stride,const uint8_t * src1,int src1_stride,int h,int w)78 static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, const uint8_t *src0,
79 int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) {
80 for (int i = 0; i < h; ++i) {
81 for (int j = 0; j < w; ++j) {
82 int diff = abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
83 int m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
84 mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
85 }
86 }
87 }
diffwtd_mask_highbd(uint8_t * mask,int which_inverse,int mask_base,const uint16_t * src0,int src0_stride,const uint16_t * src1,int src1_stride,int h,int w,const unsigned int bd)88 static AOM_FORCE_INLINE void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base,
89 const uint16_t *src0, int src0_stride,
90 const uint16_t *src1, int src1_stride, int h,
91 int w, const unsigned int bd) {
92 assert(bd >= 8);
93 if (bd == 8) {
94 if (which_inverse) {
95 for (int i = 0; i < h; ++i) {
96 for (int j = 0; j < w; ++j) {
97 int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
98 unsigned int m = negative_to_zero(mask_base + diff);
99 m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
100 mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
101 }
102 src0 += src0_stride;
103 src1 += src1_stride;
104 mask += w;
105 }
106 } else {
107 for (int i = 0; i < h; ++i) {
108 for (int j = 0; j < w; ++j) {
109 int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
110 unsigned int m = negative_to_zero(mask_base + diff);
111 m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
112 mask[j] = m;
113 }
114 src0 += src0_stride;
115 src1 += src1_stride;
116 mask += w;
117 }
118 }
119 } else {
120 const unsigned int bd_shift = bd - 8;
121 if (which_inverse) {
122 for (int i = 0; i < h; ++i) {
123 for (int j = 0; j < w; ++j) {
124 int diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
125 unsigned int m = negative_to_zero(mask_base + diff);
126 m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
127 mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
128 }
129 src0 += src0_stride;
130 src1 += src1_stride;
131 mask += w;
132 }
133 } else {
134 for (int i = 0; i < h; ++i) {
135 for (int j = 0; j < w; ++j) {
136 int diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
137 unsigned int m = negative_to_zero(mask_base + diff);
138 m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
139 mask[j] = m;
140 }
141 src0 += src0_stride;
142 src1 += src1_stride;
143 mask += w;
144 }
145 }
146 }
147 }
// Builds a difference-weighted compound mask for high bit-depth predictions.
//
// src0 and src1 are byte pointers that are reinterpreted as uint16_t sample
// buffers. DIFFWTD_38 produces the direct mask and DIFFWTD_38_INV the inverted
// one, both with a base weight of 38. Any other mask_type trips an assert and
// leaves `mask` untouched.
void svt_av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type,
                                                  const uint8_t *src0, int src0_stride,
                                                  const uint8_t *src1, int src1_stride, int h, int w,
                                                  int bd) {
    const uint16_t *const hbd_src0 = (const uint16_t *)src0;
    const uint16_t *const hbd_src1 = (const uint16_t *)src1;

    if (mask_type == DIFFWTD_38) {
        diffwtd_mask_highbd(mask, 0, 38, hbd_src0, src0_stride, hbd_src1, src1_stride, h, w, bd);
    } else if (mask_type == DIFFWTD_38_INV) {
        diffwtd_mask_highbd(mask, 1, 38, hbd_src0, src0_stride, hbd_src1, src1_stride, h, w, bd);
    } else {
        assert(0);
    }
}
164
// Builds a difference-weighted compound mask for 8-bit predictions.
//
// DIFFWTD_38 produces the direct mask and DIFFWTD_38_INV the inverted one,
// both with a base weight of 38. Any other mask_type trips an assert and
// leaves `mask` untouched.
void svt_av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type,
                                           const uint8_t *src0, int src0_stride, const uint8_t *src1,
                                           int src1_stride, int h, int w) {
    if (mask_type == DIFFWTD_38) {
        diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
    } else if (mask_type == DIFFWTD_38_INV) {
        diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
    } else {
        assert(0);
    }
}
176
177
178 // Note: Expect val to be in q4 precision
scaled_x(int32_t val,const ScaleFactors * sf)179 static INLINE int32_t scaled_x(int32_t val, const ScaleFactors *sf) {
180 const int off = (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
181 const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
182 return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
183 }
184
185 // Note: Expect val to be in q4 precision
scaled_y(int32_t val,const ScaleFactors * sf)186 static INLINE int32_t scaled_y(int32_t val, const ScaleFactors *sf) {
187 const int32_t off = (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
188 const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
189 return (int32_t)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
190 }
191
192 // Note: Expect val to be in q4 precision
unscaled_value(int32_t val,const ScaleFactors * sf)193 static int32_t unscaled_value(int32_t val, const ScaleFactors *sf) {
194 (void)sf;
195 return val << SCALE_EXTRA_BITS;
196 }
197
get_fixed_point_scale_factor(int32_t other_size,int32_t this_size)198 static int32_t get_fixed_point_scale_factor(int32_t other_size, int32_t this_size) {
199 // Calculate scaling factor once for each reference frame
200 // and use fixed point scaling factors in decoding and encoding routines.
201 // Hardware implementations can calculate scale factor in device driver
202 // and use multiplication and shifting on hardware instead of division.
203 return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
204 }
205
206 // Given the fixed point scale, calculate coarse point scale.
fixed_point_scale_to_coarse_point_scale(int32_t scale_fp)207 static int32_t fixed_point_scale_to_coarse_point_scale(int32_t scale_fp) {
208 return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
209 }
210
211 // Note: x and y are integer precision, mvq4 is q4 precision.
svt_av1_scale_mv(const MV * mvq4,int x,int y,const ScaleFactors * sf)212 MV32 svt_av1_scale_mv(const MV *mvq4, int x, int y, const ScaleFactors *sf) {
213 const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf);
214 const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf);
215 const MV32 res = {scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
216 scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4};
217 return res;
218 }
219
// Initializes `sf` for predicting a this_w x this_h frame from an
// other_w x other_h reference.
//
// If valid_ref_frame_size() rejects the size combination, only x_scale_fp and
// y_scale_fp are written (both to REF_INVALID_SCALE) and the remaining fields
// are left as they were. Otherwise the fixed-point scales, the coarse step
// sizes and the position-scaling callbacks are filled in; the callbacks are
// the scaling versions only when av1_is_scaled() reports scaling.
void svt_av1_setup_scale_factors_for_frame(ScaleFactors *sf, int other_w, int other_h, int this_w,
                                           int this_h) {
    if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
        sf->x_scale_fp = REF_INVALID_SCALE;
        sf->y_scale_fp = REF_INVALID_SCALE;
        return;
    }

    // Horizontal scale and step.
    sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
    // Vertical scale and step.
    sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
    sf->x_step_q4  = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
    sf->y_step_q4  = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);

    if (!av1_is_scaled(sf)) {
        sf->scale_value_x = unscaled_value;
        sf->scale_value_y = unscaled_value;
        return;
    }

    sf->scale_value_x = scaled_x;
    sf->scale_value_y = scaled_y;
}
242
has_scale(int32_t xs,int32_t ys)243 static INLINE int32_t has_scale(int32_t xs, int32_t ys) {
244 return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
245 }
246
revert_scale_extra_bits(SubpelParams * sp)247 static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
248 sp->subpel_x >>= SCALE_EXTRA_BITS;
249 sp->subpel_y >>= SCALE_EXTRA_BITS;
250 sp->xs >>= SCALE_EXTRA_BITS;
251 sp->ys >>= SCALE_EXTRA_BITS;
252 assert(sp->subpel_x < SUBPEL_SHIFTS);
253 assert(sp->subpel_y < SUBPEL_SHIFTS);
254 assert(sp->xs <= SUBPEL_SHIFTS);
255 assert(sp->ys <= SUBPEL_SHIFTS);
256 }
257
// Interpolation kernel table with one 8-coefficient row per sub-pel phase
// (SUBPEL_SHIFTS rows). Row 0 is the pass-through kernel (a single 128), and
// the coefficients of every row sum to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 2, -6, 126, 8, -2, 0, 0},
    {0, 2, -10, 122, 18, -4, 0, 0},
    {0, 2, -12, 116, 28, -8, 2, 0},
    {0, 2, -14, 110, 38, -10, 2, 0},
    {0, 2, -14, 102, 48, -12, 2, 0},
    {0, 2, -16, 94, 58, -12, 2, 0},
    {0, 2, -14, 84, 66, -12, 2, 0},
    {0, 2, -14, 76, 76, -14, 2, 0},
    {0, 2, -12, 66, 84, -14, 2, 0},
    {0, 2, -12, 58, 94, -16, 2, 0},
    {0, 2, -12, 48, 102, -14, 2, 0},
    {0, 2, -10, 38, 110, -14, 2, 0},
    {0, 2, -8, 28, 116, -12, 2, 0},
    {0, 0, -4, 18, 122, -10, 2, 0},
    {0, 0, -2, 8, 126, -6, 2, 0}};
// Same layout as sub_pel_filters_8, but only the four central coefficients
// (indices 2..5) of each row are non-zero. Every row also sums to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, -4, 126, 8, -2, 0, 0},
    {0, 0, -8, 122, 18, -4, 0, 0},
    {0, 0, -10, 116, 28, -6, 0, 0},
    {0, 0, -12, 110, 38, -8, 0, 0},
    {0, 0, -12, 102, 48, -10, 0, 0},
    {0, 0, -14, 94, 58, -10, 0, 0},
    {0, 0, -12, 84, 66, -10, 0, 0},
    {0, 0, -12, 76, 76, -12, 0, 0},
    {0, 0, -10, 66, 84, -12, 0, 0},
    {0, 0, -10, 58, 94, -14, 0, 0},
    {0, 0, -10, 48, 102, -12, 0, 0},
    {0, 0, -8, 38, 110, -12, 0, 0},
    {0, 0, -6, 28, 116, -10, 0, 0},
    {0, 0, -4, 18, 122, -8, 0, 0},
    {0, 0, -2, 8, 126, -4, 0, 0}};

// Largest filter length assumed when sizing the intermediate buffers of the
// 2D convolve functions in this file.
#define MAX_FILTER_TAP 8
// Returns the signed distance ref_hint - order_hint, wrapped to
// order_hint_bits bits: the low (order_hint_bits - 1) bits are kept as the
// magnitude and the top bit is treated as the sign. Returns 0 when order
// hints are disabled in the sequence header.
int get_relative_dist_enc(SeqHeader *seq_header, int ref_hint, int order_hint) {
    if (!seq_header->order_hint_info.enable_order_hint) {
        return 0;
    }

    const int raw_diff  = ref_hint - order_hint;
    const int sign_bit  = 1 << (seq_header->order_hint_info.order_hint_bits - 1);
    const int magnitude = raw_diff & (sign_bit - 1);
    const int sign_part = raw_diff & sign_bit;
    return magnitude - sign_part;
}
302
// Weight pairs that svt_av1_dist_wtd_comp_weight_assign multiplies with the
// two frame distances to pick a bucket. Only rows 0..2 are read there.
static const int quant_dist_weight[4][2] = {{2, 3}, {2, 5}, {2, 7}, {1, MAX_FRAME_DISTANCE}};
// Blend weights indexed as [order_idx][bucket][order]. Each pair sums to 16,
// and the second plane is the first with every pair swapped.
static const int quant_dist_lookup_table[2][4][2] = {
    {{9, 7}, {11, 5}, {12, 4}, {13, 3}},
    {{7, 9}, {5, 11}, {4, 12}, {3, 13}},
};
308
// Chooses the forward/backward blend weights for distance-weighted compound
// prediction.
//
// *use_dist_wtd_comp_avg is set to 1 only when is_compound is non-zero and
// compound_idx is zero; otherwise it is set to 0 and the offsets are left
// untouched. In the weighted case the two frame distances (clamped to
// [0, MAX_FRAME_DISTANCE]) select a bucket in quant_dist_lookup_table:
// bucket 3 if either distance is zero or no earlier bucket matches, else the
// first bucket whose weighted distances cross over.
void svt_av1_dist_wtd_comp_weight_assign(SeqHeader *seq_header, int cur_frame_index,
                                         int bck_frame_index, int fwd_frame_index, int compound_idx,
                                         int order_idx, int *fwd_offset, int *bck_offset,
                                         int *use_dist_wtd_comp_avg, int is_compound) {
    assert(fwd_offset != NULL && bck_offset != NULL);

    const int dist_wtd     = is_compound && !compound_idx;
    *use_dist_wtd_comp_avg = dist_wtd;
    if (!dist_wtd) {
        return;
    }

    const int fwd_dist =
        clamp(abs(get_relative_dist_enc(seq_header, fwd_frame_index, cur_frame_index)),
              0,
              MAX_FRAME_DISTANCE);
    const int bck_dist =
        clamp(abs(get_relative_dist_enc(seq_header, cur_frame_index, bck_frame_index)),
              0,
              MAX_FRAME_DISTANCE);

    const int order = fwd_dist <= bck_dist;

    // Bucket 3 is used when a distance is zero and when the search below
    // runs to completion without a match.
    int bucket = 3;
    if (fwd_dist != 0 && bck_dist != 0) {
        for (bucket = 0; bucket < 3; ++bucket) {
            const int scaled_fwd = fwd_dist * quant_dist_weight[bucket][order];
            const int scaled_bck = bck_dist * quant_dist_weight[bucket][!order];
            const int crossed    = order ? (scaled_fwd > scaled_bck) : (scaled_fwd < scaled_bck);
            if (crossed) {
                break;
            }
        }
    }

    *fwd_offset = quant_dist_lookup_table[order_idx][bucket][order];
    *bck_offset = quant_dist_lookup_table[order_idx][bucket][1 - order];
}
348
// Single-reference 2D (horizontal then vertical) sub-pel convolution for
// 8-bit pixels.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into a 16-bit
// intermediate block, rounding by conv_params->round_0. Pass 2 filters that
// block vertically, rounds by conv_params->round_1, removes the rounding
// offsets that kept the intermediate sums non-negative, applies the final
// rounding and clips to 8 bits before writing to dst.
void svt_av1_convolve_2d_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    const int32_t bd        = 8;
    const int32_t taps_x    = filter_params_x->taps;
    const int32_t taps_y    = filter_params_y->taps;
    const int32_t fo_horiz  = taps_x / 2 - 1;
    const int32_t fo_vert   = taps_y / 2 - 1;
    const int32_t im_h      = h + taps_y - 1;
    const int32_t im_stride = w;
    const int32_t bits      = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;

    // Pass 1: horizontal filter, starting fo_vert rows above the block.
    const uint8_t *const src_horiz = src - fo_vert * src_stride;
    const int16_t *const x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t row = 0; row < im_h; ++row) {
        const int32_t src_row_off = row * src_stride - fo_horiz;
        const int32_t im_row_off  = row * im_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = (1 << (bd + FILTER_BITS - 1));
            for (int32_t tap = 0; tap < taps_x; ++tap) {
                acc += x_filter[tap] * src_horiz[src_row_off + col + tap];
            }
            assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
            im_block[im_row_off + col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
        }
    }

    // Pass 2: vertical filter over the intermediate block.
    const int16_t *const src_vert = im_block + fo_vert * im_stride;
    const int16_t *const y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Combined offset contributed by the biases of both passes.
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 1 << offset_bits;
            for (int32_t tap = 0; tap < taps_y; ++tap) {
                acc += y_filter[tap] * src_vert[(row - fo_vert + tap) * im_stride + col];
            }
            assert(0 <= acc && acc < (1 << (offset_bits + 2)));
            const int16_t res =
                (ConvBufType)(ROUND_POWER_OF_TWO(acc, conv_params->round_1) - round_offset);
            dst[row * dst_stride + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), 8);
        }
    }
}
394
// Single-reference vertical-only sub-pel convolution for 8-bit pixels.
//
// Each output pixel is the filter applied to the source column centred on
// that pixel, rounded by FILTER_BITS and clipped to 8 bits. The horizontal
// filter arguments are unused; conv_params is only inspected by the asserts.
void svt_av1_convolve_y_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                             int32_t dst_stride, int32_t w, int32_t h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    assert(filter_params_y != NULL);
    const int32_t taps_y  = filter_params_y->taps;
    const int32_t fo_vert = taps_y / 2 - 1;
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params;

    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    const int16_t *const y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t first_src_row = row - fo_vert;
        const int32_t dst_row_off   = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < taps_y; ++tap) {
                acc += y_filter[tap] * src[(first_src_row + tap) * src_stride + col];
            }
            dst[dst_row_off + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), 8);
        }
    }
}
424
// Single-reference horizontal-only sub-pel convolution for 8-bit pixels.
//
// Each output pixel is the filter applied to the source row centred on that
// pixel, rounded first by conv_params->round_0 and then by the remaining
// FILTER_BITS - round_0 bits, and clipped to 8 bits. The vertical filter
// arguments are unused.
void svt_av1_convolve_x_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                             int32_t dst_stride, int32_t w, int32_t h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    const int32_t taps_x   = filter_params_x->taps;
    const int32_t fo_horiz = taps_x / 2 - 1;
    const int32_t bits     = FILTER_BITS - conv_params->round_0;
    (void)filter_params_y;
    (void)subpel_y_q4;
    (void)conv_params;

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    const int16_t *const x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row_off = row * src_stride - fo_horiz;
        const int32_t dst_row_off = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < taps_x; ++tap) {
                acc += x_filter[tap] * src[src_row_off + col + tap];
            }
            acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
            dst[dst_row_off + col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), 8);
        }
    }
}
454
// Single-reference "convolution" for the full-pel case: copies a w x h block
// of 8-bit pixels from src to dst one pixel at a time. All filter, sub-pel
// and conv_params arguments are unused.
void svt_av1_convolve_2d_copy_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                                   int32_t dst_stride, int32_t w, int32_t h,
                                   InterpFilterParams *filter_params_x,
                                   InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                   const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;
    (void)conv_params;

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row_off = row * src_stride;
        const int32_t dst_row_off = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            dst[dst_row_off + col] = src[src_row_off + col];
        }
    }
}
470
// 2D convolution with reference scaling for 8-bit pixels.
//
// Positions advance by x_step_qn / y_step_qn per output pixel in units of
// 1 / SCALE_SUBPEL_SHIFTS; the integer part selects the source sample and the
// fractional part (reduced by SCALE_EXTRA_BITS) selects the filter phase.
//
// Pass 1 filters every needed source row horizontally into a 16-bit
// intermediate block. Pass 2 walks the block column by column and filters
// vertically. Depending on conv_params the result is then:
//   - stored in conv_params->dst (compound, first prediction),
//   - averaged with conv_params->dst, optionally using the fwd/bck offsets,
//     and written to dst8 (compound, second prediction), or
//   - written directly to dst8 (non-compound).
void svt_av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
                                 int w, int h, const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y, const int subpel_x_qn,
                                 const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
                                 ConvolveParams *conv_params) {
    int16_t              im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    const int            taps_x = filter_params_x->taps;
    const int            taps_y = filter_params_y->taps;
    const int            im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
    CONV_BUF_TYPE *const dst16        = conv_params->dst;
    const int            dst16_stride = conv_params->dst_stride;
    const int            bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    assert(bits >= 0);
    const int im_stride = w;
    const int fo_vert   = taps_y / 2 - 1;
    const int fo_horiz  = taps_x / 2 - 1;
    const int bd        = 8;

    // Pass 1: horizontal filter, starting fo_vert rows above the block.
    const uint8_t *const src_horiz = src - fo_vert * src_stride;
    for (int row = 0; row < im_h; ++row) {
        const uint8_t *const row_base = src_horiz + row * src_stride;
        int                  x_qn     = subpel_x_qn;
        for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
            const uint8_t *const src_x        = &row_base[(x_qn >> SCALE_SUBPEL_BITS)];
            const int            x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(x_filter_idx < SUBPEL_SHIFTS);
            const int16_t *const x_filter =
                av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
            int32_t acc = (1 << (bd + FILTER_BITS - 1));
            for (int tap = 0; tap < taps_x; ++tap) {
                acc += x_filter[tap] * src_x[tap - fo_horiz];
            }
            assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_stride + col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
        }
    }

    // Pass 2: vertical filter, one output column at a time.
    const int16_t *const src_vert    = im_block + fo_vert * im_stride;
    const int            offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Round offset plus convolve round, removed before the final rounding.
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int col = 0; col < w; ++col) {
        int y_qn = subpel_y_qn;
        for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
            const int16_t *const src_y =
                src_vert + col + (y_qn >> SCALE_SUBPEL_BITS) * im_stride;
            const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(y_filter_idx < SUBPEL_SHIFTS);
            const int16_t *const y_filter =
                av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
            int32_t acc = 1 << offset_bits;
            for (int tap = 0; tap < taps_y; ++tap) {
                acc += y_filter[tap] * src_y[(tap - fo_vert) * im_stride];
            }
            assert(0 <= acc && acc < (1 << (offset_bits + 2)));
            const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);

            // First compound prediction: just park the intermediate value.
            if (conv_params->is_compound && !conv_params->do_average) {
                dst16[row * dst16_stride + col] = res;
                continue;
            }

            int32_t tmp;
            if (conv_params->is_compound) {
                // Second compound prediction: blend with the stored value.
                tmp = dst16[row * dst16_stride + col];
                if (conv_params->use_dist_wtd_comp_avg) {
                    tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
                    tmp = tmp >> DIST_PRECISION_BITS;
                } else {
                    tmp += res;
                    tmp = tmp >> 1;
                }
            } else {
                tmp = res;
            }

            tmp -= round_offset;
            dst8[row * dst8_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
        }
    }
}
551
// Compound 2D (horizontal then vertical) sub-pel convolution for 8-bit pixels.
//
// The filtered value, rounded by round_0 and round_1 but still carrying the
// rounding offset, is either stored in conv_params->dst (do_average == 0) or
// blended with the value already there and written to dst8 as a clipped
// 8-bit pixel (do_average != 0). The blend is a plain average unless
// use_jnt_comp_avg selects the fwd_offset / bck_offset weights.
void svt_av1_jnt_convolve_2d_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                               int32_t dst8_stride, int32_t w, int32_t h,
                               InterpFilterParams *filter_params_x,
                               InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                               const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    int16_t            im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    ConvBufType *const dst        = conv_params->dst;
    const int32_t      dst_stride = conv_params->dst_stride;
    const int32_t      bd         = 8;
    const int32_t      taps_x     = filter_params_x->taps;
    const int32_t      taps_y     = filter_params_y->taps;
    const int32_t      fo_horiz   = taps_x / 2 - 1;
    const int32_t      fo_vert    = taps_y / 2 - 1;
    const int32_t      im_h       = h + taps_y - 1;
    const int32_t      im_stride  = w;
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

    // Pass 1: horizontal filter, starting fo_vert rows above the block.
    const uint8_t *const src_horiz = src - fo_vert * src_stride;
    const int16_t *const x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t row = 0; row < im_h; ++row) {
        const int32_t src_row_off = row * src_stride - fo_horiz;
        const int32_t im_row_off  = row * im_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = (1 << (bd + FILTER_BITS - 1));
            for (int32_t tap = 0; tap < taps_x; ++tap) {
                acc += x_filter[tap] * src_horiz[src_row_off + col + tap];
            }
            assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
            im_block[im_row_off + col] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
        }
    }

    // Pass 2: vertical filter over the intermediate block.
    const int16_t *const src_vert = im_block + fo_vert * im_stride;
    const int16_t *const y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 1 << offset_bits;
            for (int32_t tap = 0; tap < taps_y; ++tap) {
                acc += y_filter[tap] * src_vert[(row - fo_vert + tap) * im_stride + col];
            }
            assert(0 <= acc && acc < (1 << (offset_bits + 2)));
            const ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(acc, conv_params->round_1);

            if (!conv_params->do_average) {
                dst[row * dst_stride + col] = res;
                continue;
            }

            int32_t tmp = dst[row * dst_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
                tmp = tmp >> DIST_PRECISION_BITS;
            } else {
                tmp += res;
                tmp = tmp >> 1;
            }
            tmp -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
        }
    }
}
611
// Compound vertical-only sub-pel convolution for 8-bit pixels.
//
// The filtered value is scaled up by FILTER_BITS - round_0 bits, rounded by
// round_1 and biased by the rounding offset. It is then either stored in
// conv_params->dst (do_average == 0) or blended with the value already there
// and written to dst8 as a clipped 8-bit pixel. The horizontal filter
// arguments are unused.
void svt_av1_jnt_convolve_y_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                              int32_t dst8_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    ConvBufType *const dst          = conv_params->dst;
    const int32_t      dst_stride   = conv_params->dst_stride;
    const int32_t      taps_y       = filter_params_y->taps;
    const int32_t      fo_vert      = taps_y / 2 - 1;
    const int32_t      bits         = FILTER_BITS - conv_params->round_0;
    const int32_t      bd           = 8;
    const int32_t      offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t      round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    (void)filter_params_x;
    (void)subpel_x_q4;

    const int16_t *const y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t first_src_row = row - fo_vert;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < taps_y; ++tap) {
                acc += y_filter[tap] * src[(first_src_row + tap) * src_stride + col];
            }
            acc *= (1 << bits);
            acc = ROUND_POWER_OF_TWO(acc, conv_params->round_1) + round_offset;

            if (!conv_params->do_average) {
                dst[row * dst_stride + col] = (ConvBufType)acc;
                continue;
            }

            int32_t tmp = dst[row * dst_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                tmp = tmp * conv_params->fwd_offset + acc * conv_params->bck_offset;
                tmp = tmp >> DIST_PRECISION_BITS;
            } else {
                tmp += acc;
                tmp = tmp >> 1;
            }
            tmp -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
        }
    }
}
657
// Compound horizontal-only sub-pel convolution for 8-bit pixels.
//
// The filtered value is rounded by round_0, scaled up by
// FILTER_BITS - round_1 bits and biased by the rounding offset. It is then
// either stored in conv_params->dst (do_average == 0) or blended with the
// value already there and written to dst8 as a clipped 8-bit pixel. The
// vertical filter arguments are unused.
void svt_av1_jnt_convolve_x_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                              int32_t dst8_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    ConvBufType *const dst          = conv_params->dst;
    const int32_t      dst_stride   = conv_params->dst_stride;
    const int32_t      taps_x       = filter_params_x->taps;
    const int32_t      fo_horiz     = taps_x / 2 - 1;
    const int32_t      bits         = FILTER_BITS - conv_params->round_1;
    const int32_t      bd           = 8;
    const int32_t      offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t      round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int16_t *const x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row_off = row * src_stride - fo_horiz;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < taps_x; ++tap) {
                acc += x_filter[tap] * src[src_row_off + col + tap];
            }
            acc = (1 << bits) * ROUND_POWER_OF_TWO(acc, conv_params->round_0);
            acc += round_offset;

            if (!conv_params->do_average) {
                dst[row * dst_stride + col] = (ConvBufType)acc;
                continue;
            }

            int32_t tmp = dst[row * dst_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                tmp = tmp * conv_params->fwd_offset + acc * conv_params->bck_offset;
                tmp = tmp >> DIST_PRECISION_BITS;
            } else {
                tmp += acc;
                tmp = tmp >> 1;
            }
            tmp -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
        }
    }
}
703
// Compound "convolution" for the full-pel case (no filtering) on 8-bit pixels.
//
// Each source pixel is shifted up by 2 * FILTER_BITS - round_0 - round_1 bits
// and biased by the rounding offset. It is then either stored in
// conv_params->dst (do_average == 0) or blended with the value already there
// and written to dst8 as a clipped 8-bit pixel. The filter and sub-pel
// arguments are unused.
void svt_av1_jnt_convolve_2d_copy_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                    int32_t dst8_stride, int32_t w, int32_t h,
                                    InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    ConvBufType *const dst        = conv_params->dst;
    const int32_t      dst_stride = conv_params->dst_stride;
    const int32_t      bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    const int32_t      bd   = 8;
    const int32_t      offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t      round_offset = (1 << (offset_bits - conv_params->round_1)) +
        (1 << (offset_bits - conv_params->round_1 - 1));
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_row_off = row * src_stride;
        for (int32_t col = 0; col < w; ++col) {
            ConvBufType scaled = src[src_row_off + col] << bits;
            scaled += (ConvBufType)round_offset;

            if (!conv_params->do_average) {
                dst[row * dst_stride + col] = scaled;
                continue;
            }

            int32_t tmp = dst[row * dst_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                tmp = tmp * conv_params->fwd_offset + scaled * conv_params->bck_offset;
                tmp = tmp >> DIST_PRECISION_BITS;
            } else {
                tmp += scaled;
                tmp = tmp >> 1;
            }
            tmp -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), 8);
        }
    }
}
743
/*
 * Single-reference "copy" kernel for 16-bit samples: copies a w x h block of
 * samples from src (row pitch src_stride) to dst (row pitch dst_stride).
 * The filter, sub-pel, conv_params and bd arguments are ignored.
 */
void svt_av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                          int32_t dst_stride, int32_t w, int32_t h,
                                          const InterpFilterParams *filter_params_x,
                                          const InterpFilterParams *filter_params_y,
                                          const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                          ConvolveParams *conv_params, int32_t bd) {
    (void)bd;
    (void)conv_params;
    (void)subpel_y_q4;
    (void)subpel_x_q4;
    (void)filter_params_y;
    (void)filter_params_x;

    for (int32_t row = 0; row < h; ++row) {
        const uint16_t *in_row  = src + row * src_stride;
        uint16_t *      out_row = dst + row * dst_stride;
        for (int32_t col = 0; col < w; ++col) { out_row[col] = in_row[col]; }
    }
}
761
/*
 * Horizontal-only single-reference interpolation for 16-bit samples
 * (C reference version).
 *
 * For every output sample the kernel selected by (subpel_x_q4 & SUBPEL_MASK)
 * is applied to filter_params_x->taps horizontally adjacent source samples.
 * The sum is rounded by conv_params->round_0, then by the remaining
 * FILTER_BITS - round_0 bits, and clipped to bit depth bd.
 * filter_params_y and subpel_y_q4 are ignored.
 */
void svt_av1_highbd_convolve_x_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                    int32_t dst_stride, int32_t w, int32_t h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                    ConvolveParams *conv_params, int32_t bd) {
    const int32_t num_taps = filter_params_x->taps;
    const int32_t fo_horiz = num_taps / 2 - 1;
    const int32_t round_0  = conv_params->round_0;
    const int32_t bits     = FILTER_BITS - round_0;
    (void)filter_params_y;
    (void)subpel_y_q4;

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // First source sample covered by the filter window for this output.
            const uint16_t *window = &src[row * src_stride + col - fo_horiz];
            int32_t         acc    = 0;
            for (int32_t k = 0; k < num_taps; ++k) { acc += kernel[k] * window[k]; }
            acc = ROUND_POWER_OF_TWO(acc, round_0);
            dst[row * dst_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
        }
    }
}
790
/*
 * Vertical-only single-reference interpolation for 16-bit samples
 * (C reference version).
 *
 * For every output sample the kernel selected by (subpel_y_q4 & SUBPEL_MASK)
 * is applied to filter_params_y->taps vertically adjacent source samples.
 * The sum is rounded by FILTER_BITS and clipped to bit depth bd.
 * filter_params_x and subpel_x_q4 are ignored; conv_params is only inspected
 * by the assertions.
 */
void svt_av1_highbd_convolve_y_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                    int32_t dst_stride, int32_t w, int32_t h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                    ConvolveParams *conv_params, int32_t bd) {
    assert(filter_params_y != NULL);
    const int32_t num_taps = filter_params_y->taps;
    const int32_t fo_vert  = num_taps / 2 - 1;
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params; // referenced only inside assert()

    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // Topmost source sample covered by the filter window for this output.
            const uint16_t *column = &src[(row - fo_vert) * src_stride + col];
            int32_t         acc    = 0;
            for (int32_t k = 0; k < num_taps; ++k) { acc += kernel[k] * column[k * src_stride]; }
            dst[row * dst_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
        }
    }
}
818
/*
 * Separable 2-D single-reference interpolation for 16-bit samples
 * (C reference version).
 *
 * Pass 1 filters h + taps_y - 1 source rows horizontally into the local
 * intermediate buffer (rounded by conv_params->round_0). Pass 2 filters that
 * buffer vertically, rounds by conv_params->round_1, removes the offsets that
 * were added to keep the intermediate sums non-negative, applies the final
 * rounding, and clips to bit depth bd.
 */
void svt_av1_highbd_convolve_2d_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                     int32_t dst_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    const int32_t taps_x    = filter_params_x->taps;
    const int32_t taps_y    = filter_params_y->taps;
    const int32_t im_h      = h + taps_y - 1;
    const int32_t im_stride = w;
    const int32_t fo_vert   = taps_y / 2 - 1;
    const int32_t fo_horiz  = taps_x / 2 - 1;
    const int32_t round_0   = conv_params->round_0;
    const int32_t round_1   = conv_params->round_1;
    const int32_t bits      = FILTER_BITS * 2 - round_0 - round_1;
    assert(bits >= 0);

    // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row -fo_vert.
    const int16_t *x_kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t row = 0; row < im_h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            const uint16_t *window = &src[(row - fo_vert) * src_stride + col - fo_horiz];
            int32_t         sum    = (1 << (bd + FILTER_BITS - 1));
            for (int32_t k = 0; k < taps_x; ++k) { sum += x_kernel[k] * window[k]; }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_stride + col] = (ConvBufType)ROUND_POWER_OF_TWO(sum, round_0);
        }
    }

    // Pass 2: vertical filter. Output row y reads intermediate rows y .. y + taps_y - 1.
    const int16_t *y_kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - round_0;
    const int32_t round_offset = (1 << (offset_bits - round_1)) +
                                 (1 << (offset_bits - round_1 - 1));
    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            const int16_t *column = &im_block[row * im_stride + col];
            int32_t        sum    = 1 << offset_bits;
            for (int32_t k = 0; k < taps_y; ++k) { sum += y_kernel[k] * column[k * im_stride]; }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
            dst[row * dst_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
        }
    }
}
866
/*
 * Separable 2-D interpolation with per-sample position stepping for 16-bit
 * samples (C reference version).
 *
 * The source position advances by x_step_qn / y_step_qn (units of
 * 1 / SCALE_SUBPEL_SHIFTS) per output sample, starting from subpel_x_qn /
 * subpel_y_qn. The integer part of the position selects the source sample,
 * the fractional part (reduced by SCALE_EXTRA_BITS) selects the kernel.
 *
 * Output depends on conv_params:
 *   - is_compound && !do_average: the round_1-rounded value is stored in
 *     conv_params->dst;
 *   - is_compound && do_average:  it is blended with the value already in
 *     conv_params->dst, then un-biased, rounded, clipped and written to dst;
 *   - !is_compound:               it is un-biased, rounded, clipped and
 *     written to dst.
 */
void svt_av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst,
                                        int dst_stride, int w, int h,
                                        const InterpFilterParams *filter_params_x,
                                        const InterpFilterParams *filter_params_y,
                                        const int subpel_x_qn, const int x_step_qn,
                                        const int subpel_y_qn, const int y_step_qn,
                                        ConvolveParams *conv_params, int bd) {
    int16_t        im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    const int      taps_x = filter_params_x->taps;
    const int      taps_y = filter_params_y->taps;
    const int      im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
    const int      im_stride    = w;
    const int      fo_vert      = taps_y / 2 - 1;
    const int      fo_horiz     = taps_x / 2 - 1;
    CONV_BUF_TYPE *dst16        = conv_params->dst;
    const int      dst16_stride = conv_params->dst_stride;
    const int      round_0      = conv_params->round_0;
    const int      round_1      = conv_params->round_1;
    const int      bits         = FILTER_BITS * 2 - round_0 - round_1;
    assert(bits >= 0);

    // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row -fo_vert.
    for (int row = 0; row < im_h; ++row) {
        const uint16_t *src_row = src + (row - fo_vert) * src_stride;
        int16_t *       im_row  = im_block + row * im_stride;
        int             x_qn    = subpel_x_qn;
        for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
            const uint16_t *const src_x      = &src_row[(x_qn >> SCALE_SUBPEL_BITS)];
            const int             kernel_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(kernel_idx < SUBPEL_SHIFTS);
            const int16_t *x_kernel =
                av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int k = 0; k < taps_x; ++k) { sum += x_kernel[k] * src_x[k - fo_horiz]; }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_row[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
        }
    }

    // Pass 2: vertical filter, processed column by column.
    const int     offset_bits  = bd + 2 * FILTER_BITS - round_0;
    const int32_t round_offset = (1 << (offset_bits - round_1)) +
                                 (1 << (offset_bits - round_1 - 1));
    for (int col = 0; col < w; ++col) {
        const int16_t *im_col = im_block + fo_vert * im_stride + col;
        int            y_qn   = subpel_y_qn;
        for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
            const int16_t *src_y      = &im_col[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
            const int      kernel_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(kernel_idx < SUBPEL_SHIFTS);
            const int16_t *y_kernel =
                av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);
            int32_t sum = 1 << offset_bits;
            for (int k = 0; k < taps_y; ++k) {
                sum += y_kernel[k] * src_y[(k - fo_vert) * im_stride];
            }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

            if (conv_params->is_compound && !conv_params->do_average) {
                // Keep the biased value for a later blending call.
                dst16[row * dst16_stride + col] = res;
                continue;
            }

            int32_t tmp = res;
            if (conv_params->is_compound) {
                tmp = dst16[row * dst16_stride + col];
                if (conv_params->use_dist_wtd_comp_avg) {
                    tmp = (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                          DIST_PRECISION_BITS;
                } else {
                    tmp = (tmp + res) >> 1;
                }
            }
            /* Subtract round offset and convolve round */
            tmp -= round_offset;
            dst[row * dst_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
        }
    }
}
947
/*
 * Horizontal-only compound interpolation for 16-bit samples (C reference
 * version).
 *
 * The horizontally filtered value is rounded by conv_params->round_0, scaled
 * by 1 << (FILTER_BITS - round_1) and biased by round_offset. When
 * conv_params->do_average is zero it is stored in conv_params->dst; otherwise
 * it is blended with the value already stored there (weighted when
 * use_jnt_comp_avg is set, plain average otherwise), un-biased, rounded,
 * clipped to bd bits and written to dst16.
 * filter_params_y and subpel_y_q4 are ignored.
 */
void svt_av1_highbd_jnt_convolve_x_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                     int32_t dst16_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    ConvBufType * acc_buf      = conv_params->dst;
    const int32_t acc_stride   = conv_params->dst_stride;
    const int32_t num_taps     = filter_params_x->taps;
    const int32_t fo_horiz     = num_taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_1;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
                                 (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    assert(round_bits >= 0);
    (void)filter_params_y;
    (void)subpel_y_q4;
    assert(bits >= 0);

    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            const uint16_t *window = &src[row * src_stride + col - fo_horiz];
            int32_t         res    = 0;
            for (int32_t k = 0; k < num_taps; ++k) { res += kernel[k] * window[k]; }
            res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
            res += round_offset;

            if (!conv_params->do_average) {
                acc_buf[row * acc_stride + col] = (ConvBufType)res;
                continue;
            }

            int32_t blended = acc_buf[row * acc_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = (blended * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                          DIST_PRECISION_BITS;
            } else {
                blended = (blended + res) >> 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
        }
    }
}
994
/*
 * Vertical-only compound interpolation for 16-bit samples (C reference
 * version).
 *
 * The vertically filtered value is scaled by 1 << (FILTER_BITS - round_0),
 * rounded by conv_params->round_1 and biased by round_offset. When
 * conv_params->do_average is zero it is stored in conv_params->dst; otherwise
 * it is blended with the value already stored there (weighted when
 * use_jnt_comp_avg is set, plain average otherwise), un-biased, rounded,
 * clipped to bd bits and written to dst16.
 * filter_params_x and subpel_x_q4 are ignored.
 */
void svt_av1_highbd_jnt_convolve_y_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                     int32_t dst16_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    ConvBufType * acc_buf      = conv_params->dst;
    const int32_t acc_stride   = conv_params->dst_stride;
    const int32_t num_taps     = filter_params_y->taps;
    const int32_t fo_vert      = num_taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - conv_params->round_0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
                                 (1 << (offset_bits - conv_params->round_1 - 1));
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    assert(round_bits >= 0);
    (void)filter_params_x;
    (void)subpel_x_q4;
    assert(bits >= 0);

    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            const uint16_t *column = &src[(row - fo_vert) * src_stride + col];
            int32_t         res    = 0;
            for (int32_t k = 0; k < num_taps; ++k) { res += kernel[k] * column[k * src_stride]; }
            res *= (1 << bits);
            res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;

            if (!conv_params->do_average) {
                acc_buf[row * acc_stride + col] = (ConvBufType)res;
                continue;
            }

            int32_t blended = acc_buf[row * acc_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = (blended * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                          DIST_PRECISION_BITS;
            } else {
                blended = (blended + res) >> 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
        }
    }
}
1041
/*
 * Compound-prediction "copy" kernel for 16-bit samples (C reference version).
 *
 * Every source sample is shifted left by `bits` and biased by round_offset.
 * When conv_params->do_average is zero the result is stored in
 * conv_params->dst; otherwise it is blended with the value already stored
 * there (weighted when use_jnt_comp_avg is set, plain average otherwise),
 * un-biased, rounded, clipped to bd bits and written to dst16.
 * The filter and sub-pel arguments are ignored.
 */
void svt_av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                           int32_t dst16_stride, int32_t w, int32_t h,
                                           const InterpFilterParams *filter_params_x,
                                           const InterpFilterParams *filter_params_y,
                                           const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                           ConvolveParams *conv_params, int32_t bd) {
    ConvBufType * acc_buf      = conv_params->dst;
    const int32_t acc_stride   = conv_params->dst_stride;
    const int32_t bits         = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - conv_params->round_0;
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
                                 (1 << (offset_bits - conv_params->round_1 - 1));
    assert(bits >= 0);
    (void)subpel_y_q4;
    (void)subpel_x_q4;
    (void)filter_params_y;
    (void)filter_params_x;

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            ConvBufType res = src[row * src_stride + col] << bits;
            res += (ConvBufType)round_offset;

            if (!conv_params->do_average) {
                acc_buf[row * acc_stride + col] = res;
                continue;
            }

            int32_t blended = acc_buf[row * acc_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = (blended * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                          DIST_PRECISION_BITS;
            } else {
                blended = (blended + res) >> 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), bd);
        }
    }
}
1080
/*
 * Separable 2-D compound interpolation for 16-bit samples (C reference
 * version).
 *
 * Pass 1 filters h + taps_y - 1 source rows horizontally into a local
 * intermediate buffer (rounded by conv_params->round_0). Pass 2 filters that
 * buffer vertically and rounds by conv_params->round_1. When
 * conv_params->do_average is zero the result is stored in conv_params->dst;
 * otherwise it is blended with the value already stored there (weighted when
 * use_jnt_comp_avg is set, plain average otherwise), un-biased, rounded,
 * clipped to bd bits and written to dst16.
 */
void svt_av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                      int32_t dst16_stride, int32_t w, int32_t h,
                                      const InterpFilterParams *filter_params_x,
                                      const InterpFilterParams *filter_params_y,
                                      const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                      ConvolveParams *conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    ConvBufType * acc_buf    = conv_params->dst;
    const int32_t acc_stride = conv_params->dst_stride;
    const int32_t taps_x     = filter_params_x->taps;
    const int32_t taps_y     = filter_params_y->taps;
    const int32_t im_h       = h + taps_y - 1;
    const int32_t im_stride  = w;
    const int32_t fo_vert    = taps_y / 2 - 1;
    const int32_t fo_horiz   = taps_x / 2 - 1;
    const int32_t round_0    = conv_params->round_0;
    const int32_t round_1    = conv_params->round_1;

    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    assert(round_bits >= 0);

    // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row -fo_vert.
    const int16_t *x_kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int row = 0; row < im_h; ++row) {
        for (int col = 0; col < w; ++col) {
            const uint16_t *window = &src[(row - fo_vert) * src_stride + col - fo_horiz];
            int32_t         sum    = (1 << (bd + FILTER_BITS - 1));
            for (int k = 0; k < taps_x; ++k) { sum += x_kernel[k] * window[k]; }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_stride + col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
        }
    }

    // Pass 2: vertical filter. Output row y reads intermediate rows y .. y + taps_y - 1.
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - round_0;
    const int32_t round_offset = (1 << (offset_bits - round_1)) +
                                 (1 << (offset_bits - round_1 - 1));
    const int16_t *y_kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            const int16_t *column = &im_block[row * im_stride + col];
            int32_t        sum    = 1 << offset_bits;
            for (int k = 0; k < taps_y; ++k) { sum += y_kernel[k] * column[k * im_stride]; }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, round_1);

            if (!conv_params->do_average) {
                acc_buf[row * acc_stride + col] = res;
                continue;
            }

            int32_t blended = acc_buf[row * acc_stride + col];
            if (conv_params->use_jnt_comp_avg) {
                blended = (blended * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                          DIST_PRECISION_BITS;
            } else {
                blended = (blended + res) >> 1;
            }
            blended -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
        }
    }
}
1145
1146 aom_highbd_convolve_fn_t convolveHbd[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
asm_set_convolve_hbd_asm_table(void)1147 void asm_set_convolve_hbd_asm_table(void) {
1148 convolveHbd[0][0][0] = svt_av1_highbd_convolve_2d_copy_sr;
1149 convolveHbd[0][0][1] = svt_av1_highbd_jnt_convolve_2d_copy;
1150
1151 convolveHbd[0][1][0] = svt_av1_highbd_convolve_y_sr;
1152 convolveHbd[0][1][1] = svt_av1_highbd_jnt_convolve_y;
1153
1154 convolveHbd[1][0][0] = svt_av1_highbd_convolve_x_sr;
1155 convolveHbd[1][0][1] = svt_av1_highbd_jnt_convolve_x;
1156
1157 convolveHbd[1][1][0] = svt_av1_highbd_convolve_2d_sr;
1158 convolveHbd[1][1][1] = svt_av1_highbd_jnt_convolve_2d;
1159 }
1160
1161 AomConvolveFn convolve[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
asm_set_convolve_asm_table(void)1162 void asm_set_convolve_asm_table(void) {
1163 convolve[0][0][0] = svt_av1_convolve_2d_copy_sr;
1164 convolve[0][0][1] = svt_av1_jnt_convolve_2d_copy;
1165
1166 convolve[0][1][0] = svt_av1_convolve_y_sr;
1167 convolve[0][1][1] = svt_av1_jnt_convolve_y;
1168
1169 convolve[1][0][0] = svt_av1_convolve_x_sr;
1170 convolve[1][0][1] = svt_av1_jnt_convolve_x;
1171
1172 convolve[1][1][0] = svt_av1_convolve_2d_sr;
1173 convolve[1][1][1] = svt_av1_jnt_convolve_2d;
1174 }
1175
// Filter descriptor built on the sub_pel_filters_8 kernel table, tagged EIGHTTAP_REGULAR.
// NOTE(review): initializer order is presumably (kernel table, tap count, sub-pel shift
// count, filter type); confirm against the InterpFilterParams declaration.
InterpFilterParams av1RegularFilter = {
    (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR};
// Same descriptor but backed by the sub_pel_filters_4 kernel table; it still declares
// SUBPEL_TAPS taps.
InterpFilterParams av1RegularFilterW4 = {
    (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR};
1180
// "Sharp" interpolation kernels: one 8-coefficient row per sub-pel position
// (SUBPEL_SHIFTS rows). Row 0 is the pass-through kernel (single coefficient of 128);
// the coefficients of every row sum to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {-2, 2, -6, 126, 8, -2, 2, 0},
    {-2, 6, -12, 124, 16, -6, 4, -2},
    {-2, 8, -18, 120, 26, -10, 6, -2},
    {-4, 10, -22, 116, 38, -14, 6, -2},
    {-4, 10, -22, 108, 48, -18, 8, -2},
    {-4, 10, -24, 100, 60, -20, 8, -2},
    {-4, 10, -24, 90, 70, -22, 10, -2},
    {-4, 12, -24, 80, 80, -24, 12, -4},
    {-2, 10, -22, 70, 90, -24, 10, -4},
    {-2, 8, -20, 60, 100, -24, 10, -4},
    {-2, 8, -18, 48, 108, -22, 10, -4},
    {-2, 6, -14, 38, 116, -22, 10, -4},
    {-2, 6, -10, 26, 120, -18, 8, -2},
    {-2, 4, -6, 16, 124, -12, 6, -2},
    {0, 2, -2, 8, 126, -6, 2, -2}};
1198
// "Smooth" interpolation kernels: one 8-coefficient row per sub-pel position
// (SUBPEL_SHIFTS rows). Row 0 is the pass-through kernel; the coefficients of every
// row sum to 128. The first and last coefficient of every row are zero.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 2, 28, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, -2, 16, 54, 48, 12, 0, 0},
    {0, -2, 14, 52, 52, 14, -2, 0},
    {0, 0, 12, 48, 54, 16, -2, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 28, 2, 0}};
// Bilinear interpolation kernels stored in 8-coefficient rows: only coefficients 3 and 4
// are non-zero. From one row to the next, 8 moves from coefficient 3 to coefficient 4,
// so every row sums to 128.
DECLARE_ALIGNED(256, const InterpKernel, bilinear_filters[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 0, 120, 8, 0, 0, 0},
    {0, 0, 0, 112, 16, 0, 0, 0},
    {0, 0, 0, 104, 24, 0, 0, 0},
    {0, 0, 0, 96, 32, 0, 0, 0},
    {0, 0, 0, 88, 40, 0, 0, 0},
    {0, 0, 0, 80, 48, 0, 0, 0},
    {0, 0, 0, 72, 56, 0, 0, 0},
    {0, 0, 0, 64, 64, 0, 0, 0},
    {0, 0, 0, 56, 72, 0, 0, 0},
    {0, 0, 0, 48, 80, 0, 0, 0},
    {0, 0, 0, 40, 88, 0, 0, 0},
    {0, 0, 0, 32, 96, 0, 0, 0},
    {0, 0, 0, 24, 104, 0, 0, 0},
    {0, 0, 0, 16, 112, 0, 0, 0},
    {0, 0, 0, 8, 120, 0, 0, 0}};
// 4-tap "smooth" interpolation kernels stored in 8-coefficient rows: apart from the
// pass-through row 0, only coefficients 2..5 are non-zero. Every row sums to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 30, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, 0, 14, 54, 48, 12, 0, 0},
    {0, 0, 12, 52, 52, 12, 0, 0},
    {0, 0, 12, 48, 54, 14, 0, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 30, 0, 0}};
// Forward declaration only; the definition is not in this part of the file.
// NOTE(review): presumably maps a luma block size to the chroma block size for the
// given subsampling; confirm against the definition.
BlockSize scale_chroma_bsize(BlockSize bsize, int32_t subsampling_x, int32_t subsampling_y);
1251
/*
 * Sub-pel prediction for intra block copy, 8-bit samples.
 *
 * Uses the BILINEAR entry of av1_interp_filter_params_list in every direction
 * whose sub-pel offset is non-zero and dispatches to the 2-D, horizontal-only
 * or vertical-only single-reference kernel. The kernels are always given a
 * sub-pel position of 8 for an active direction, regardless of the magnitude
 * of subpel_x_q4 / subpel_y_q4; those arguments only select the direction.
 *
 * Precondition: at least one of subpel_x_q4 / subpel_y_q4 is non-zero.
 * With both zero the final branch would hand a NULL vertical filter to
 * svt_av1_convolve_y_sr.
 */
void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h, int subpel_x_q4, int subpel_y_q4,
                             ConvolveParams *conv_params) {
    assert(subpel_x_q4 != 0 || subpel_y_q4 != 0);

    const InterpFilterParams *filter_params_x =
        subpel_x_q4 ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    const InterpFilterParams *filter_params_y =
        subpel_y_q4 ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
        svt_av1_convolve_2d_sr(src,
                               src_stride,
                               dst,
                               dst_stride,
                               w,
                               h,
                               (InterpFilterParams *)filter_params_x,
                               (InterpFilterParams *)filter_params_y,
                               8,
                               8,
                               conv_params);
    } else if (subpel_x_q4 != 0) {
        svt_av1_convolve_x_sr(src,
                              src_stride,
                              dst,
                              dst_stride,
                              w,
                              h,
                              (InterpFilterParams *)filter_params_x,
                              (InterpFilterParams *)filter_params_y,
                              8,
                              0,
                              conv_params);
    } else {
        // Only the vertical offset is non-zero here (see the precondition above).
        svt_av1_convolve_y_sr(src,
                              src_stride,
                              dst,
                              dst_stride,
                              w,
                              h,
                              (InterpFilterParams *)filter_params_x,
                              (InterpFilterParams *)filter_params_y,
                              0,
                              8,
                              conv_params);
    }
}
/*
 * Sub-pel prediction for intra block copy, 16-bit samples.
 *
 * Uses the BILINEAR entry of av1_interp_filter_params_list in every direction
 * whose sub-pel offset is non-zero and dispatches to the 2-D, horizontal-only
 * or vertical-only single-reference high-bit-depth kernel. The kernels are
 * always given a sub-pel position of 8 for an active direction, regardless of
 * the magnitude of subpel_x_q4 / subpel_y_q4; those arguments only select the
 * direction.
 *
 * Precondition: at least one of subpel_x_q4 / subpel_y_q4 is non-zero.
 * With both zero the final branch would hand a NULL vertical filter to
 * svt_av1_highbd_convolve_y_sr; the C version of that kernel in this file
 * dereferences filter_params_y.
 */
void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, uint16_t *dst,
                                    int dst_stride, int w, int h, int subpel_x_q4,
                                    int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    assert(subpel_x_q4 != 0 || subpel_y_q4 != 0);

    const InterpFilterParams *filter_params_x =
        subpel_x_q4 ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    const InterpFilterParams *filter_params_y =
        subpel_y_q4 ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
        svt_av1_highbd_convolve_2d_sr(src,
                                      src_stride,
                                      dst,
                                      dst_stride,
                                      w,
                                      h,
                                      filter_params_x,
                                      filter_params_y,
                                      8,
                                      8,
                                      conv_params,
                                      bd);
    } else if (subpel_x_q4 != 0) {
        svt_av1_highbd_convolve_x_sr(src,
                                     src_stride,
                                     dst,
                                     dst_stride,
                                     w,
                                     h,
                                     filter_params_x,
                                     filter_params_y,
                                     8,
                                     0,
                                     conv_params,
                                     bd);
    } else {
        // Only the vertical offset is non-zero here (see the precondition above).
        svt_av1_highbd_convolve_y_sr(src,
                                     src_stride,
                                     dst,
                                     dst_stride,
                                     w,
                                     h,
                                     filter_params_x,
                                     filter_params_y,
                                     0,
                                     8,
                                     conv_params,
                                     bd);
    }
}
1345
/*
 * Builds one 8-bit inter prediction block of size w x h.
 *
 * Selects the interpolation filters for interp_filters and then takes one of
 * three paths:
 *   - intra block copy with a non-zero sub-pel offset: convolve_2d_for_intrabc;
 *   - scaled reference (has_scale() is true for xs/ys): svt_av1_convolve_2d_scale;
 *   - otherwise: the kernel picked from the `convolve` table by whether each
 *     sub-pel offset is non-zero and by conv_params->is_compound.
 * On the unscaled path the sub-pel parameters are first passed through
 * revert_scale_extra_bits(). `sf` is only checked by an assertion.
 */
void svt_inter_predictor(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
                         const SubpelParams *subpel_params, const ScaleFactors *sf, int32_t w,
                         int32_t h, ConvolveParams *conv_params, InterpFilters interp_filters,
                         int32_t is_intrabc) {
    InterpFilterParams filter_params_x, filter_params_y;
    const int32_t      is_scaled = has_scale(subpel_params->xs, subpel_params->ys);

    av1_get_convolve_filter_params(interp_filters, &filter_params_x, &filter_params_y, w, h);

    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    assert(sf);
    UNUSED(sf);
    assert(IMPLIES(is_intrabc, !is_scaled));

    // Work on a private copy; it is only modified on the unscaled path.
    SubpelParams sp = *subpel_params;
    if (!is_scaled) { revert_scale_extra_bits(&sp); }

    // Intra block copy with a fractional offset is handled the same way on both paths.
    if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
        convolve_2d_for_intrabc(
            src, src_stride, dst, dst_stride, w, h, sp.subpel_x, sp.subpel_y, conv_params);
        return;
    }

    if (is_scaled) {
        assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
        svt_av1_convolve_2d_scale(src,
                                  src_stride,
                                  dst,
                                  dst_stride,
                                  w,
                                  h,
                                  &filter_params_x,
                                  &filter_params_y,
                                  sp.subpel_x,
                                  sp.xs,
                                  sp.subpel_y,
                                  sp.ys,
                                  conv_params);
        return;
    }

    const AomConvolveFn kernel =
        convolve[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound];
    kernel(src,
           src_stride,
           dst,
           dst_stride,
           w,
           h,
           &filter_params_x,
           &filter_params_y,
           sp.subpel_x,
           sp.subpel_y,
           conv_params);
}
1410
/*
 * Builds one 16-bit inter prediction block of size w x h at bit depth bd.
 *
 * Selects the interpolation filters for interp_filters and then takes one of
 * three paths:
 *   - intra block copy with a non-zero sub-pel offset:
 *     highbd_convolve_2d_for_intrabc;
 *   - scaled reference (has_scale() is true for xs/ys):
 *     svt_av1_highbd_convolve_2d_scale;
 *   - otherwise: the kernel picked from the `convolveHbd` table by whether each
 *     sub-pel offset is non-zero and by conv_params->is_compound.
 * On the unscaled path the sub-pel parameters are first passed through
 * revert_scale_extra_bits(). `sf` is only checked by an assertion.
 */
void svt_highbd_inter_predictor(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                int32_t dst_stride, const SubpelParams *subpel_params,
                                const ScaleFactors *sf, int32_t w, int32_t h,
                                ConvolveParams *conv_params, InterpFilters interp_filters,
                                int32_t is_intrabc, int32_t bd) {
    InterpFilterParams filter_params_x, filter_params_y;
    const int32_t      is_scaled = has_scale(subpel_params->xs, subpel_params->ys);

    av1_get_convolve_filter_params(interp_filters, &filter_params_x, &filter_params_y, w, h);

    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    assert(sf);
    UNUSED(sf);
    assert(IMPLIES(is_intrabc, !is_scaled));

    // Work on a private copy; it is only modified on the unscaled path.
    SubpelParams sp = *subpel_params;
    if (!is_scaled) { revert_scale_extra_bits(&sp); }

    // Intra block copy with a fractional offset is handled the same way on both paths.
    if (is_intrabc && (sp.subpel_x != 0 || sp.subpel_y != 0)) {
        highbd_convolve_2d_for_intrabc(
            src, src_stride, dst, dst_stride, w, h, sp.subpel_x, sp.subpel_y, conv_params, bd);
        return;
    }

    if (is_scaled) {
        assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
        svt_av1_highbd_convolve_2d_scale(src,
                                         src_stride,
                                         dst,
                                         dst_stride,
                                         w,
                                         h,
                                         &filter_params_x,
                                         &filter_params_y,
                                         sp.subpel_x,
                                         sp.xs,
                                         sp.subpel_y,
                                         sp.ys,
                                         conv_params,
                                         bd);
        return;
    }

    const aom_highbd_convolve_fn_t kernel =
        convolveHbd[sp.subpel_x != 0][sp.subpel_y != 0][conv_params->is_compound];
    kernel(src,
           src_stride,
           dst,
           dst_stride,
           w,
           h,
           &filter_params_x,
           &filter_params_y,
           sp.subpel_x,
           sp.subpel_y,
           conv_params,
           bd);
}
1479
1480
// Non-zero: the hard-coded wedge_signflip_lookup values are used as-is and
// init_wedge_signs() is compiled out.
#define USE_PRECOMPUTED_WEDGE_SIGN 1
// Non-zero: init_wedge_primary_masks() builds the prototype masks from the
// wedge_primary_* tables instead of evaluating tanh() at start-up.
// NOTE(review): this switch also guards the codebooks, wedge_params_lookup and
// several helper functions, so setting it to 0 removes those too - confirm
// before changing.
#define USE_PRECOMPUTED_WEDGE_MASK 1
1483
1484 #if USE_PRECOMPUTED_WEDGE_MASK
// One-dimensional 0..64 transition profiles, MASK_PRIMARY_SIZE entries each.
// init_wedge_primary_masks() shifts the "even" / "odd" profiles to build the
// even / odd rows of the WEDGE_OBLIQUE63 prototype and copies the "vertical"
// profile into every row of the WEDGE_VERTICAL prototype.
static const uint8_t wedge_primary_oblique_odd[MASK_PRIMARY_SIZE] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
static const uint8_t wedge_primary_oblique_even[MASK_PRIMARY_SIZE] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 1, 4, 11, 27, 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
static const uint8_t wedge_primary_vertical[MASK_PRIMARY_SIZE] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 2, 7, 21, 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64,
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
1500
1501
// Per-wedge sign-flip flags, indexed [block size][wedge index].
// get_wedge_mask_inplace() XORs the flag with the requested sign to choose
// between the primary mask and its complement.
// Rows tagged "not used" line up positionally with the wedge_params_lookup
// entries that carry no codebook (bits == 0).
DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BlockSizeS_ALL][MAX_WEDGE_TYPES]) = {
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
    { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
};
1526
1527
// Wedge codebooks: 16 wedge shapes per block-shape class.
// get_wedge_mask_inplace() reads each entry's direction, x_offset and
// y_offset; the offsets are scaled by (block dimension) / 8, so 4 means the
// block centre. NOTE(review): the initializers presumably list
// { direction, x_offset, y_offset } in that order - confirm against the
// WedgeCodeType declaration.

// Used by wedge_params_lookup for the 8x16, 16x32 and 8x32 entries
// (height greater than width).
static const WedgeCodeType wedge_codebook_16_hgtw[16] = {
    { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
    { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
    { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
    { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
    { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

// Used by wedge_params_lookup for the 16x8, 32x16 and 32x8 entries
// (height less than width).
static const WedgeCodeType wedge_codebook_16_hltw[16] = {
    { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
    { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
    { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
    { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
    { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

// Used by wedge_params_lookup for the 8x8, 16x16 and 32x32 entries
// (height equal to width).
static const WedgeCodeType wedge_codebook_16_heqw[16] = {
    { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
    { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
    { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
    { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
    { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
    { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
    { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
1560
// Wedge parameters per block size: { bits, codebook, signflip row, mask table }.
// bits == 0 (with NULL pointers) marks a block size without wedge support;
// otherwise 1 << bits wedge shapes are available. The mask table pointers
// refer to wedge_masks[], which init_wedge_masks() fills in at start-up.
// Entries are positional, i.e. they must stay in block-size enumeration order.
static const WedgeParamsType wedge_params_lookup[BlockSizeS_ALL] = {
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
      wedge_masks[BLOCK_8X8] },
    { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
      wedge_masks[BLOCK_8X16] },
    { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
      wedge_masks[BLOCK_16X8] },
    { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
      wedge_masks[BLOCK_16X16] },
    { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
      wedge_masks[BLOCK_16X32] },
    { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
      wedge_masks[BLOCK_32X16] },
    { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
      wedge_masks[BLOCK_32X32] },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
    { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
      wedge_masks[BLOCK_8X32] },
    { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
      wedge_masks[BLOCK_32X8] },
    { 0, NULL, NULL, NULL },
    { 0, NULL, NULL, NULL },
};
1594
// Returns non-zero when the given block size has wedge shapes available
// (its wedge_params_lookup entry has a positive bit count).
int is_interintra_wedge_used(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits > 0;
}
1598
// Returns the wedge bit count of the given block size; 1 << bits is the
// number of wedge shapes, and 0 means no wedge support.
int32_t get_wedge_bits_lookup(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits;
}
1602
// Returns the mask stored for (wedge_sign, wedge_index) in the mask table of
// the given block size. The table is populated by init_wedge_masks(); there
// is no check here that the block size actually supports wedges.
const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, int wedge_sign,
                                            BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->masks[wedge_sign][wedge_index];
}
1607
// Plain 2-D copy of a w x h byte block from src to dst, one row at a time.
// The filter arguments exist only to match the convolve function signature
// and are ignored.
static void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride, int w, int h) {
    (void)filter_x;
    (void)filter_x_stride;
    (void)filter_y;
    (void)filter_y_stride;

    int rows_left = h;
    while (rows_left > 0) {
        svt_memcpy(dst, src, w);
        dst += dst_stride;
        src += src_stride;
        --rows_left;
    }
}
1623
// Copies a row of `width` bytes from src to dst displaced by `shift`
// positions: to the right for shift >= 0, to the left otherwise. The
// positions uncovered by the shift are filled with the nearest edge value of
// src (src[0] on the left, src[width - 1] on the right).
static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
    if (shift < 0) {
        const int left = -shift;
        svt_memcpy(dst, src + left, width - left);
        memset(dst + width - left, src[width - 1], left);
        return;
    }

    svt_memcpy(dst + shift, src, width - shift);
    memset(dst, src[0], shift);
}
1634
// Returns the wedge bit count stored in wedge_params_lookup for sb_type
// (same value as get_wedge_bits_lookup(), returned as a plain int).
int get_wedge_params_bits(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits;
}
1638
1639 #endif // USE_PRECOMPUTED_WEDGE_MASK
1640
1641
// Prototype wedge masks, indexed [negative][direction]; each one is a
// MASK_PRIMARY_SIZE x MASK_PRIMARY_SIZE image. Index [0] holds the primary
// masks and [1] their complements (see init_wedge_primary_masks()).
DECLARE_ALIGNED(16, static uint8_t,
                wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_PRIMARY_SIZE * MASK_PRIMARY_SIZE]);

// Backing storage for the per-block-size masks carved out by init_wedge_masks().
// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
1649
/*
 * Fills wedge_mask_obl[0][dir] (primary masks) and wedge_mask_obl[1][dir]
 * (their complements) for every wedge direction.
 *
 * Step 1 builds the WEDGE_OBLIQUE63 and WEDGE_VERTICAL primaries, either from
 * the precomputed 1-D profiles or, when USE_PRECOMPUTED_WEDGE_MASK is 0, from
 * a tanh() ramp. Step 2 derives all remaining directions and all complements
 * from those two by transposing, mirroring and subtracting from
 * (1 << WEDGE_WEIGHT_BITS).
 */
static void init_wedge_primary_masks() {
    const int w = MASK_PRIMARY_SIZE;
    const int h = MASK_PRIMARY_SIZE;
    const int stride = MASK_PRIMARY_STRIDE;
    // Note: index [0] stores the primary, and [1] its complement.
#if USE_PRECOMPUTED_WEDGE_MASK
    // Generate prototype by shifting the primary
    // Rows are handled in pairs: the even row uses the current shift, the odd
    // row below it uses shift - 1, and the next pair starts from that
    // decremented value. The shift starts at h / 4 and becomes negative
    // further down, which moves the transition from right to left.
    int shift = h / 4;
    for (int i = 0; i < h; i += 2) {
        shift_copy(wedge_primary_oblique_even,
                   &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride],
                   shift,
                   MASK_PRIMARY_SIZE);
        shift--;
        shift_copy(wedge_primary_oblique_odd,
                   &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride],
                   shift,
                   MASK_PRIMARY_SIZE);
        // Every row of the vertical prototype is the same unshifted profile.
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
                   wedge_primary_vertical,
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
                   wedge_primary_vertical,
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
    }
#else
    // Analytic fallback: weight = 32 * (1 + tanh(distance / smoother_param)),
    // where distance is measured to the line with normal (2, 1) for the
    // oblique prototype and to the vertical centre line for the vertical one.
    static const double smoother_param = 2.85;
    const int a[2] = {2, 1};
    const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; ++j) {
            int x = (2 * j + 1 - w);
            int y = (2 * i + 1 - h);
            double d = (a[0] * x + a[1] * y) / asqrt;
            const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
            wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
            const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
            wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
        }
    }
#endif // USE_PRECOMPUTED_WEDGE_MASK
    for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
            const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
            // OBLIQUE27 primary: transpose of the OBLIQUE63 primary.
            wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
            // OBLIQUE117 / OBLIQUE153 primaries: complemented value written to
            // the column-mirrored position, respectively its transpose.
            wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
                wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
                    (1 << WEDGE_WEIGHT_BITS) - msk;
            // Complements ([1]) of the four oblique directions.
            wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
                wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = (1 << WEDGE_WEIGHT_BITS) - msk;
            wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
                wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
            // HORIZONTAL primary: transpose of the VERTICAL primary; then the
            // complements of both.
            const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
            wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
            wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
                wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
                    (1 << WEDGE_WEIGHT_BITS) - mskx;
        }
    }
}
1710
1711 #if !USE_PRECOMPUTED_WEDGE_SIGN
// If the signs for the wedges for various blocksizes are
// inconsistent flip the sign flag. Do it only once for every
// wedge codebook.
//
// Only compiled when USE_PRECOMPUTED_WEDGE_SIGN is 0; it then recomputes
// wedge_signflip_lookup from the prototype masks instead of relying on the
// hard-coded table.
// NOTE(review): this disabled code uses BLOCK_SIZE, BLOCK_SIZES_ALL and
// wedge_params_type, whereas the rest of the file uses BlockSize,
// BlockSizeS_ALL and WedgeParamsType, and it calls get_wedge_mask_inplace()
// before its definition - it probably does not build as-is; confirm before
// enabling.
static void init_wedge_signs() {
    memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
    for (BLOCK_SIZE sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
        const int bw = block_size_wide[sb_type];
        const int bh = block_size_high[sb_type];
        const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
        const int wbits = wedge_params.bits;
        const int wtypes = 1 << wbits;

        if (wbits) {
            for (int w = 0; w < wtypes; ++w) {
                // Get the mask primary, i.e. index [0]
                const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
                // Rounded average over the top row and the left column of the
                // mask (bw + bh - 1 samples; the corner is counted once).
                int avg = 0;
                for (int i = 0; i < bw; ++i) avg += mask[i];
                for (int i = 1; i < bh; ++i) avg += mask[i * MASK_PRIMARY_STRIDE];
                avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
                // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
                // If default sign is 1:
                //   If sign requested is 0, we need to flip the sign and return
                //   the complement i.e. index [1] instead. If sign requested is 1
                //   we need to flip the sign and return index [0] instead.
                // If default sign is 0:
                //   If sign requested is 0, we need to return index [0] the primary
                //   if sign requested is 1, we need to return the complement index [1]
                //   instead.
                wedge_params.signflip[w] = (avg < 32);
            }
        }
    }
}
1746 #endif // !USE_PRECOMPUTED_WEDGE_SIGN
1747
// Returns a pointer into the prototype mask image (wedge_mask_obl) at the
// top-left corner of the window that represents the requested wedge for
// sb_type. The window is positioned so that the codebook offset, given in
// eighths of the block dimension, lands on the centre of the prototype.
// `neg` selects primary (0) or complement (1) and is XOR-ed with the wedge's
// sign-flip flag. Rows of the returned mask are MASK_PRIMARY_STRIDE apart.
static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, BlockSize sb_type) {
    assert(wedge_index >= 0 && wedge_index < (1 << get_wedge_bits_lookup(sb_type)));

    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    const WedgeCodeType *const   code   = params->codebook + wedge_index;
    const uint8_t                flip   = params->signflip[wedge_index];

    const int block_w = block_size_wide[sb_type];
    const int block_h = block_size_high[sb_type];
    const int col_off = (code->x_offset * block_w) >> 3;
    const int row_off = (code->y_offset * block_h) >> 3;

    const int first_row = MASK_PRIMARY_SIZE / 2 - row_off;
    const int first_col = MASK_PRIMARY_SIZE / 2 - col_off;

    return wedge_mask_obl[neg ^ flip][code->direction] + MASK_PRIMARY_STRIDE * first_row +
        first_col;
}
1762
init_wedge_masks()1763 static void init_wedge_masks() {
1764 uint8_t * dst = wedge_mask_buf;
1765 memset(wedge_masks, 0, sizeof(wedge_masks));
1766 for (BlockSize bsize = BLOCK_4X4; bsize < BlockSizeS_ALL; ++bsize) {
1767 const int bw = block_size_wide[bsize];
1768 const int bh = block_size_high[bsize];
1769 const WedgeParamsType *wedge_params = &wedge_params_lookup[bsize];
1770 const int wbits = wedge_params->bits;
1771 const int wtypes = 1 << wbits;
1772 if (wbits == 0) continue;
1773 for (int w = 0; w < wtypes; ++w) {
1774 const uint8_t *mask;
1775 mask = get_wedge_mask_inplace(w, 0, bsize);
1776 aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
1777 wedge_params->masks[0][w] = dst;
1778 dst += bw * bh;
1779
1780 mask = get_wedge_mask_inplace(w, 1, bsize);
1781 aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
1782 wedge_params->masks[1][w] = dst;
1783 dst += bw * bh;
1784 }
1785 assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
1786 }
1787 }
1788
// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
//
// One-time set-up of all wedge tables: builds the prototype masks, optionally
// recomputes the sign-flip flags (only when USE_PRECOMPUTED_WEDGE_SIGN is 0),
// then creates the per-block-size contiguous masks. The order matters: the
// later steps read the prototype masks written by the first one.
void svt_av1_init_wedge_masks(void) {
    init_wedge_primary_masks();
#if !USE_PRECOMPUTED_WEDGE_SIGN
    init_wedge_signs();
#endif // !USE_PRECOMPUTED_WEDGE_SIGN
    init_wedge_masks();
}
1797
// Prototype for the predicate used by av1_get_compound_type_mask() below.
// NOTE(review): the function is already defined near the top of this file, so
// this declaration looks redundant.
int is_masked_compound_type(COMPOUND_TYPE type);
1799
/* clang-format off */
// Decaying 1-D inter-intra weight profile. build_smooth_interintra_mask()
// samples it at (position * ii_size_scales[block size]).
static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
    60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
    31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
    16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
    8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
    4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
// Step into ii_weights1d per pixel, one entry per block size.
// NOTE(review): never written in the visible code, so it could presumably be
// declared const.
static uint8_t ii_size_scales[BlockSizeS_ALL] = {
    32, 16, 16, 16, 8, 8, 8, 4,
    4, 4, 2, 2, 2, 1, 1, 1,
    8, 8, 4, 4, 2, 2
};
/* clang-format on */
1816
/*
 * Writes the smooth inter-intra blending mask for plane_bsize into `mask`
 * (rows `stride` bytes apart).
 *
 * The weight of a pixel is looked up in ii_weights1d at
 * (distance * ii_size_scales[plane_bsize]), where the distance is
 *   II_V_PRED      : the row index,
 *   II_H_PRED      : the column index,
 *   II_SMOOTH_PRED : the smaller of row and column index.
 * II_DC_PRED and any other mode produce a constant mask of 32.
 */
void build_smooth_interintra_mask(uint8_t *mask, int stride, BlockSize plane_bsize,
                                  InterIntraMode mode) {
    const int width  = block_size_wide[plane_bsize];
    const int height = block_size_high[plane_bsize];
    const int scale  = ii_size_scales[plane_bsize];
    uint8_t * row    = mask;

    if (mode == II_V_PRED) {
        for (int y = 0; y < height; ++y, row += stride) {
            memset(row, ii_weights1d[y * scale], width * sizeof(row[0]));
        }
    } else if (mode == II_H_PRED) {
        for (int y = 0; y < height; ++y, row += stride) {
            for (int x = 0; x < width; ++x) { row[x] = ii_weights1d[x * scale]; }
        }
    } else if (mode == II_SMOOTH_PRED) {
        for (int y = 0; y < height; ++y, row += stride) {
            for (int x = 0; x < width; ++x) {
                const int nearest = (y < x) ? y : x;
                row[x]            = ii_weights1d[nearest * scale];
            }
        }
    } else {
        // II_DC_PRED and everything else: equal weights.
        for (int y = 0; y < height; ++y, row += stride) {
            memset(row, 32, width * sizeof(row[0]));
        }
    }
}
1854
/*
 * High bit-depth inter-intra blend: combines intrapred8 and interpred8 into
 * comppred8 through svt_aom_highbd_blend_a64_mask().
 *
 * Without wedge, the mask comes from build_smooth_interintra_mask() for
 * `mode`. With wedge, the mask is the stored wedge mask of `bsize`; its
 * stride is the full block width, and subw / subh are set when the plane
 * dimension equals 2 * mi_size_wide / mi_size_high of the block. If wedge is
 * requested but `bsize` has no wedge shapes, nothing is written.
 */
void combine_interintra_highbd(InterIntraMode mode, uint8_t use_wedge_interintra,
                               uint8_t wedge_index, uint8_t wedge_sign, BlockSize bsize,
                               BlockSize plane_bsize, uint8_t *comppred8, int compstride,
                               const uint8_t *interpred8, int interstride,
                               const uint8_t *intrapred8, int intrastride, int bd) {
    const int plane_w = block_size_wide[plane_bsize];
    const int plane_h = block_size_high[plane_bsize];

    if (!use_wedge_interintra) {
        uint8_t smooth_mask[MAX_SB_SQUARE];
        build_smooth_interintra_mask(smooth_mask, plane_w, plane_bsize, mode);
        svt_aom_highbd_blend_a64_mask(comppred8,
                                      compstride,
                                      intrapred8,
                                      intrastride,
                                      interpred8,
                                      interstride,
                                      smooth_mask,
                                      plane_w,
                                      plane_w,
                                      plane_h,
                                      0,
                                      0,
                                      bd);
        return;
    }

    if (!is_interintra_wedge_used(bsize)) return;

    const uint8_t *wedge_mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    const int      sub_h      = 2 * mi_size_high[bsize] == plane_h;
    const int      sub_w      = 2 * mi_size_wide[bsize] == plane_w;
    svt_aom_highbd_blend_a64_mask(comppred8,
                                  compstride,
                                  intrapred8,
                                  intrastride,
                                  interpred8,
                                  interstride,
                                  wedge_mask,
                                  block_size_wide[bsize],
                                  plane_w,
                                  plane_h,
                                  sub_w,
                                  sub_h,
                                  bd);
}
1901
// Returns the blending mask for a masked compound mode: the stored wedge mask
// for COMPOUND_WEDGE, or the caller-provided seg_mask for COMPOUND_DIFFWTD.
// Any other type trips an assert and yields NULL.
static const uint8_t *av1_get_compound_type_mask(const InterInterCompoundData *const comp_data,
                                                 uint8_t *seg_mask, BlockSize sb_type) {
    assert(is_masked_compound_type(comp_data->type));

    if (comp_data->type == COMPOUND_WEDGE) {
        return av1_get_contiguous_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, sb_type);
    }
    if (comp_data->type == COMPOUND_DIFFWTD) { return seg_mask; }

    assert(0);
    return NULL;
}
1913
/*
 * Blends two unrounded convolution buffers (src0, src1) into dst using the
 * mask of the masked compound mode described by comp_data.
 *
 * The mask stride is the full width of sb_type. The subsampling flags are
 * inferred from the sizes: subw / subh are set when w / h equals
 * 2 << mi_size_wide_log2 / mi_size_high_log2 of sb_type. is_16bit selects the
 * high bit-depth blend (which also takes bit_depth) over the low bit-depth one.
 */
void build_masked_compound_no_round(uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0,
                                    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
                                    const InterInterCompoundData *const comp_data,
                                    uint8_t *seg_mask, BlockSize sb_type, int h, int w,
                                    ConvolveParams *conv_params, uint8_t bit_depth, EbBool is_16bit) {
    const int      sub_h       = (2 << mi_size_high_log2[sb_type]) == h;
    const int      sub_w       = (2 << mi_size_wide_log2[sb_type]) == w;
    const uint8_t *blend_mask  = av1_get_compound_type_mask(comp_data, seg_mask, sb_type);
    const int      mask_stride = block_size_wide[sb_type];

    if (!is_16bit) {
        svt_aom_lowbd_blend_a64_d16_mask(dst,
                                         dst_stride,
                                         src0,
                                         src0_stride,
                                         src1,
                                         src1_stride,
                                         blend_mask,
                                         mask_stride,
                                         w,
                                         h,
                                         sub_w,
                                         sub_h,
                                         conv_params);
        return;
    }

    svt_aom_highbd_blend_a64_d16_mask(dst,
                                      dst_stride,
                                      src0,
                                      src0_stride,
                                      src1,
                                      src1_stride,
                                      blend_mask,
                                      mask_stride,
                                      w,
                                      h,
                                      sub_w,
                                      sub_h,
                                      conv_params,
                                      bit_depth);
}
1956
1957
// Sets the default reference displacement vector for intra block copy.
// If the position mib_size mode-info rows above mi_row falls before the tile's
// first row, the vector points left by (MI_SIZE * mib_size +
// INTRABC_DELAY_PIXELS); otherwise it points up by MI_SIZE * mib_size.
// Both components are finally multiplied by 8. mi_col is unused.
void av1_find_ref_dv(IntMv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row,
                     int mi_col) {
    (void)mi_col;

    const int above_is_outside_tile = mi_row - mib_size < tile->mi_row_start;

    ref_dv->as_mv.row = above_is_outside_tile ? 0 : -MI_SIZE * mib_size;
    ref_dv->as_mv.col = above_is_outside_tile ? -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS : 0;

    ref_dv->as_mv.row *= 8;
    ref_dv->as_mv.col *= 8;
}
1971
// Number of elements of a true array (not a pointer), as int32_t.
// The argument and the whole expansion are parenthesized so the macro behaves
// like a single primary expression wherever it is used.
#define n_elements(x) ((int32_t)(sizeof(x) / sizeof((x)[0])))
1973
comp_ref0(int32_t ref_idx)1974 MvReferenceFrame comp_ref0(int32_t ref_idx) {
1975 static const MvReferenceFrame lut[] = {
1976 LAST_FRAME, // LAST_LAST2_FRAMES,
1977 LAST_FRAME, // LAST_LAST3_FRAMES,
1978 LAST_FRAME, // LAST_GOLDEN_FRAMES,
1979 BWDREF_FRAME, // BWDREF_ALTREF_FRAMES,
1980 LAST2_FRAME, // LAST2_LAST3_FRAMES
1981 LAST2_FRAME, // LAST2_GOLDEN_FRAMES,
1982 LAST3_FRAME, // LAST3_GOLDEN_FRAMES,
1983 BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES,
1984 ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES,
1985 };
1986 assert(n_elements(lut) == TOTAL_UNIDIR_COMP_REFS);
1987 return lut[ref_idx];
1988 }
1989
comp_ref1(int32_t ref_idx)1990 MvReferenceFrame comp_ref1(int32_t ref_idx) {
1991 static const MvReferenceFrame lut[] = {
1992 LAST2_FRAME, // LAST_LAST2_FRAMES,
1993 LAST3_FRAME, // LAST_LAST3_FRAMES,
1994 GOLDEN_FRAME, // LAST_GOLDEN_FRAMES,
1995 ALTREF_FRAME, // BWDREF_ALTREF_FRAMES,
1996 LAST3_FRAME, // LAST2_LAST3_FRAMES
1997 GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES,
1998 GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES,
1999 ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES,
2000 ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES,
2001 };
2002 assert(n_elements(lut) == TOTAL_UNIDIR_COMP_REFS);
2003 return lut[ref_idx];
2004 }
2005
get_uni_comp_ref_idx(const MvReferenceFrame * const rf)2006 int8_t get_uni_comp_ref_idx(const MvReferenceFrame *const rf) {
2007 // Single ref pred
2008 if (rf[1] <= INTRA_FRAME) return -1;
2009
2010 // Bi-directional comp ref pred
2011 if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
2012
2013 for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
2014 if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) return ref_idx;
2015 }
2016 return -1;
2017 }
2018
av1_ref_frame_type(const MvReferenceFrame * const rf)2019 int8_t av1_ref_frame_type(const MvReferenceFrame *const rf) {
2020 if (rf[1] > INTRA_FRAME) {
2021 const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
2022 if (uni_comp_ref_idx >= 0) {
2023 assert((TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
2024 MODE_CTX_REF_FRAMES);
2025 return TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
2026 } else {
2027 return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) + BWD_RF_OFFSET(rf[1]) * FWD_REFS;
2028 }
2029 }
2030
2031 return rf[0];
2032 }
2033
// Compound reference pairs, indexed by (ref_frame_type - TOTAL_REFS_PER_FRAME);
// av1_set_ref_frame() uses it to turn a compound reference-frame type back
// into its two reference frames.
// NOTE(review): never written in the visible code, so it could presumably be
// declared const.
static MvReferenceFrame ref_frame_map[TOTAL_COMP_REFS][2] = {
    {LAST_FRAME, BWDREF_FRAME},
    {LAST2_FRAME, BWDREF_FRAME},
    {LAST3_FRAME, BWDREF_FRAME},
    {GOLDEN_FRAME, BWDREF_FRAME},
    {LAST_FRAME, ALTREF2_FRAME},
    {LAST2_FRAME, ALTREF2_FRAME},
    {LAST3_FRAME, ALTREF2_FRAME},
    {GOLDEN_FRAME, ALTREF2_FRAME},
    {LAST_FRAME, ALTREF_FRAME},
    {LAST2_FRAME, ALTREF_FRAME},
    {LAST3_FRAME, ALTREF_FRAME},
    {GOLDEN_FRAME, ALTREF_FRAME},
    {LAST_FRAME, LAST2_FRAME},
    {LAST_FRAME, LAST3_FRAME},
    {LAST_FRAME, GOLDEN_FRAME},
    {BWDREF_FRAME, ALTREF_FRAME},
    // NOTE: Following reference frame pairs are not supported to be explicitly
    //       signalled, but they are possibly chosen by the use of skip_mode,
    //       which may use the most recent one-sided reference frame pair.
    {LAST2_FRAME, LAST3_FRAME},
    {LAST2_FRAME, GOLDEN_FRAME},
    {LAST3_FRAME, GOLDEN_FRAME},
    {BWDREF_FRAME, ALTREF2_FRAME},
    {ALTREF2_FRAME, ALTREF_FRAME}};
2059
// Expands a reference-frame type into its reference pair rf[0], rf[1].
// Types below TOTAL_REFS_PER_FRAME are single references (rf[1] becomes
// NONE_FRAME); larger values are looked up in ref_frame_map. No upper bound
// check is performed on ref_frame_type.
void av1_set_ref_frame(MvReferenceFrame *rf, int8_t ref_frame_type) {
    if (ref_frame_type < TOTAL_REFS_PER_FRAME) {
        rf[0] = ref_frame_type;
        rf[1] = NONE_FRAME;
        return;
    }

    const int pair_idx = ref_frame_type - TOTAL_REFS_PER_FRAME;
    rf[0]              = ref_frame_map[pair_idx][0];
    rf[1]              = ref_frame_map[pair_idx][1];
}
2070
// Tells whether OBMC prediction should be skipped for the plane obtained by
// subsampling `bsize`. Only plane sizes 4x4, 8x4 and 4x8 can be skipped:
// always when DISABLE_CHROMA_U8X8_OBMC is set, otherwise only for dir == 0.
// All larger plane sizes return 0.
int svt_av1_skip_u4x4_pred_in_obmc(BlockSize bsize, int dir, int subsampling_x, int subsampling_y) {
    assert(is_motion_variation_allowed_bsize(bsize));

    const BlockSize bsize_plane = get_plane_block_size(bsize, subsampling_x, subsampling_y);
    const int       is_small_plane =
        bsize_plane == BLOCK_4X4 || bsize_plane == BLOCK_8X4 || bsize_plane == BLOCK_4X8;

    if (!is_small_plane) { return 0; }

#if DISABLE_CHROMA_U8X8_OBMC
    return 1;
#else
    return dir == 0;
#endif
}
2088
2089 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
2090
2091 /**
2092 * Computes SSE of a compound predictor constructed from 2 fundamental
2093 * predictors p0 and p1 using blending with mask.
2094 *
2095 * r1: Residuals of p1.
2096 * (source - p1)
2097 * d: Difference of p1 and p0.
2098 * (p1 - p0)
2099 * m: The blending mask
2100 * N: Number of pixels
2101 *
2102 * 'r1', 'd', and 'm' are contiguous.
2103 *
2104 * Computes:
2105 * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
2106 * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
2107 * where r0 is (source - p0), and r1 is (source - p1), which is in turn
2108 * is equivalent to:
2109 * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
2110 * which is the SSE of the residuals of the compound predictor scaled up by
2111 * MAX_MASK_VALUE**2.
2112 *
2113 * Note that we clamp the partial term in the loop to 16 bits signed. This is
2114 * to facilitate equivalent SIMD implementation. It should have no effect if
2115 * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
2116 * holds for 8 bit input, and on real input, it should hold practically always,
2117 * as residuals are expected to be small.
2118 */
svt_av1_wedge_sse_from_residuals_c(const int16_t * r1,const int16_t * d,const uint8_t * m,int N)2119 uint64_t svt_av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m,
2120 int N) {
2121 uint64_t csse = 0;
2122
2123 for (int i = 0; i < N; i++) {
2124 int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
2125 t = clamp(t, INT16_MIN, INT16_MAX);
2126 csse += t * t;
2127 }
2128 return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
2129 }
2130
2131
/*
 * 8-bit inter-intra blend: combines intrapred and interpred into comppred
 * through svt_aom_blend_a64_mask().
 *
 * Without wedge, the mask comes from build_smooth_interintra_mask() for
 * `mode`. With wedge, the mask is the stored wedge mask of `bsize`; its
 * stride is the full block width, and subw / subh are set when the plane
 * dimension equals 2 * mi_size_wide / mi_size_high of the block. If wedge is
 * requested but `bsize` has no wedge shapes, nothing is written.
 */
void combine_interintra(InterIntraMode mode, int8_t use_wedge_interintra, int wedge_index,
                        int wedge_sign, BlockSize bsize, BlockSize plane_bsize, uint8_t *comppred,
                        int compstride, const uint8_t *interpred, int interstride,
                        const uint8_t *intrapred, int intrastride) {
    const int plane_w = block_size_wide[plane_bsize];
    const int plane_h = block_size_high[plane_bsize];

    if (!use_wedge_interintra) {
        uint8_t smooth_mask[MAX_SB_SQUARE];
        build_smooth_interintra_mask(smooth_mask, plane_w, plane_bsize, mode);
        svt_aom_blend_a64_mask(comppred,
                               compstride,
                               intrapred,
                               intrastride,
                               interpred,
                               interstride,
                               smooth_mask,
                               plane_w,
                               plane_w,
                               plane_h,
                               0,
                               0);
        return;
    }

    if (!is_interintra_wedge_used(bsize)) return;

    const uint8_t *wedge_mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
    const int      sub_w      = 2 * mi_size_wide[bsize] == plane_w;
    const int      sub_h      = 2 * mi_size_high[bsize] == plane_h;
    svt_aom_blend_a64_mask(comppred,
                           compstride,
                           intrapred,
                           intrastride,
                           interpred,
                           interstride,
                           wedge_mask,
                           block_size_wide[bsize],
                           plane_w,
                           plane_h,
                           sub_w,
                           sub_h);
}
2175
/*
 * Blends two 16-bit sample blocks with a horizontal mask: every row uses the
 * same per-column weight mask[j], applied through AOM_BLEND_A64(mask, src0,
 * src1). dst may alias src0 or src1 provided the strides match.
 *
 * w and h must be powers of two (>= 1); bd is only validated by an assert.
 */
void svt_aom_highbd_blend_a64_hmask_16bit_c(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
                                            uint32_t src0_stride, const uint16_t *src1,
                                            uint32_t src1_stride, const uint8_t *mask, int w, int h,
                                            int bd) {
    (void)bd;

    assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

    assert(h >= 1);
    assert(w >= 1);
    assert(IS_POWER_OF_TWO(h));
    assert(IS_POWER_OF_TWO(w));

    assert(bd == 8 || bd == 10 || bd == 12);

    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            const uint16_t a = src0[row * src0_stride + col];
            const uint16_t b = src1[row * src1_stride + col];

            dst[row * dst_stride + col] = AOM_BLEND_A64(mask[col], a, b);
        }
    }
}
2199
/*
 * Returns the sum of squares of the n signed 16-bit values starting at src.
 *
 * n == 0 yields 0 without touching src. (A do/while loop counting n down
 * would wrap the unsigned counter on n == 0 and read far out of bounds.)
 */
uint64_t svt_aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
    uint64_t ss = 0;

    for (uint32_t i = 0; i < n; ++i) {
        // |v| <= 32768, so v * v <= 2^30 and cannot overflow a 32-bit int.
        const int32_t v = src[i];
        ss += (uint64_t)(v * v);
    }
    return ss;
}
2209
// obmc_mask_N[overlap_position]
// Blending weights for an overlap of N samples, one table per supported
// overlap length; every table rises towards 64 at its last entries.
// svt_av1_get_obmc_mask() selects the table by length.
static const uint8_t obmc_mask_1[1] = {64};
DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = {45, 64};

DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = {39, 50, 59, 64};

static const uint8_t obmc_mask_8[8] = {36, 42, 48, 53, 57, 61, 64, 64};

static const uint8_t obmc_mask_16[16] = {
    34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64};

static const uint8_t obmc_mask_32[32] = {33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48,
                                         50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60,
                                         61, 62, 64, 64, 64, 64, 64, 64, 64, 64};

static const uint8_t obmc_mask_64[64] = {
    33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, 45, 46, 47, 47, 48, 49,
    50, 51, 51, 51, 52, 52, 53, 54, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60,
    61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
2230
svt_av1_get_obmc_mask(int length)2231 const uint8_t *svt_av1_get_obmc_mask(int length) {
2232 switch (length) {
2233 case 1: return obmc_mask_1;
2234 case 2: return obmc_mask_2;
2235 case 4: return obmc_mask_4;
2236 case 8: return obmc_mask_8;
2237 case 16: return obmc_mask_16;
2238 case 32: return obmc_mask_32;
2239 case 64: return obmc_mask_64;
2240 default: assert(0); return NULL;
2241 }
2242 }
2243