1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/av1_rtcd.h"
18
19 #include "aom/aom_integer.h"
20 #include "aom_ports/mem.h"
21
22 #include "aom_dsp/aom_filter.h"
23 #include "aom_dsp/blend.h"
24 #include "aom_dsp/variance.h"
25
26 #include "av1/common/av1_common_int.h"
27 #include "av1/common/filter.h"
28 #include "av1/common/reconinter.h"
29 #include "av1/encoder/reconinter_enc.h"
30
// Sum of squared differences over a fixed 4x4 block (SSE only; no sum or
// variance term is computed).
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sse = 0;
  for (int r = 0; r < 4; ++r, a += a_stride, b += b_stride) {
    for (int c = 0; c < 4; ++c) {
      const int d = a[c] - b[c];
      sse += d * d;
    }
  }
  return sse;
}
48
// Sum of squares of a 256-sample (16x16) block of 16-bit residuals.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int total = 0;
  for (int i = 0; i < 256; ++i) {
    total += a[i] * a[i];
  }
  return total;
}
58
// Accumulates the raw sum of differences (*sum) and sum of squared
// differences (*sse) between a w x h block in `a` and the co-located block
// in `b`.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int total_sum = 0;
  uint32_t total_sse = 0;

  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      total_sum += diff;
      total_sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = total_sum;
  *sse = total_sse;
}
77
// SSE between two blocks of arbitrary (e.g. odd) dimensions; a thin wrapper
// around variance() that discards the sum term.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse = 0;
  int sum = 0;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  (void)sum;  // only the squared error is wanted here
  return sse;
}
85
86 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
87 // or vertical direction to produce the filtered output block. Used to implement
88 // the first-pass of 2-D separable filter.
89 //
90 // Produces int16_t output to retain precision for the next pass. Two filter
91 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
92 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
93 // It defines the offset required to move from one input to the next.
aom_var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)94 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
95 unsigned int src_pixels_per_line,
96 unsigned int pixel_step,
97 unsigned int output_height,
98 unsigned int output_width,
99 const uint8_t *filter) {
100 unsigned int i, j;
101
102 for (i = 0; i < output_height; ++i) {
103 for (j = 0; j < output_width; ++j) {
104 b[j] = ROUND_POWER_OF_TWO(
105 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
106
107 ++a;
108 }
109
110 a += src_pixels_per_line - output_width;
111 b += output_width;
112 }
113 }
114
115 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
116 // or vertical direction to produce the filtered output block. Used to implement
117 // the second-pass of 2-D separable filter.
118 //
119 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
120 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
121 // filter is applied horizontally (pixel_step = 1) or vertically
122 // (pixel_step = stride). It defines the offset required to move from one input
123 // to the next. Output is 8-bit.
aom_var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)124 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
125 unsigned int src_pixels_per_line,
126 unsigned int pixel_step,
127 unsigned int output_height,
128 unsigned int output_width,
129 const uint8_t *filter) {
130 unsigned int i, j;
131
132 for (i = 0; i < output_height; ++i) {
133 for (j = 0; j < output_width; ++j) {
134 b[j] = ROUND_POWER_OF_TWO(
135 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
136 ++a;
137 }
138
139 a += src_pixels_per_line - output_width;
140 b += output_width;
141 }
142 }
143
// Defines aom_variance<W>x<H>_c(): writes the raw SSE through *sse and
// returns the variance, i.e. sse - sum^2 / (W * H).
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
152
// Defines aom_sub_pixel_variance<W>x<H>_c(): interpolates the source block
// with the two-pass 2-tap bilinear filter at the given 1/8-pel offsets
// (xoffset/yoffset index bilinear_filters_2t), then computes the variance of
// the filtered block against b.
#define SUBPIX_VAR(W, H)                                                      \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
  }
167
// Defines aom_sub_pixel_avg_variance<W>x<H>_c() and its distance-weighted
// variant: interpolate the source with the two-pass bilinear filter, blend
// the result with second_pred (plain or distance-weighted average), then
// compute the variance of the blended block against b.
// Consistency fix: the dist_wtd variant now returns via
// aom_variance##W##x##H##_c, matching the plain variant above, so this _c
// reference function no longer dispatches to an optimized kernel.
#define SUBPIX_AVG_VAR(W, H)                                                   \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred) {                                            \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
                                            bilinear_filters_2t[xoffset]);     \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
                                             bilinear_filters_2t[yoffset]);    \
                                                                               \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
                                                                               \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
  }                                                                            \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
                                            bilinear_filters_2t[xoffset]);     \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
                                             bilinear_filters_2t[yoffset]);    \
                                                                               \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                               \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
  }
203
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
 * variable.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
227
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

// Realtime mode doesn't use rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

// Only 16x16 and 8x8 expose the raw (sse, sum) pair.
GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
271
// Rounded average of a prediction block and a reference block. pred and the
// comp_pred output are packed with stride == width; ref uses ref_stride.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + ref[j], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
286
// Get pred block from up-sampled reference.
void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                          int mi_row, int mi_col, const MV *const mv,
                          uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride, int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    // Intra block copy references the current frame at identity scale.
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // A scaled reference cannot be handled by the fixed convolve paths
      // below, so build the prediction with the full inter predictor.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Integer-pel position: plain copy, no filtering needed.
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only filtering; the 1/8-pel offset is doubled to index the
    // 1/16-pel kernel table.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
  } else if (!subpel_x_q3) {
    // Vertical-only filtering.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
  } else {
    // Separable 2-D filtering: horizontal pass into a temp buffer tall
    // enough for the vertical taps, then a vertical pass into comp_pred.
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
                          width, intermediate_height);
    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
  }
}
361
// Builds the sub-pel interpolated prediction into comp_pred, then replaces
// it with the rounded average of that prediction and pred (both packed with
// stride == width).
void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                   int mi_row, int mi_col, const MV *const mv,
                                   uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, int subpel_search) {
  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
381
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)382 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
383 int width, int height, const uint8_t *ref,
384 int ref_stride,
385 const DIST_WTD_COMP_PARAMS *jcp_param) {
386 int i, j;
387 const int fwd_offset = jcp_param->fwd_offset;
388 const int bck_offset = jcp_param->bck_offset;
389
390 for (i = 0; i < height; ++i) {
391 for (j = 0; j < width; ++j) {
392 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
393 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
394 comp_pred[j] = (uint8_t)tmp;
395 }
396 comp_pred += width;
397 pred += width;
398 ref += ref_stride;
399 }
400 }
401
// Builds the sub-pel interpolated prediction into comp_pred, then replaces
// it with the distance-weighted blend of that prediction (fwd_offset) and
// pred (bck_offset).
void aom_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const int blended = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
      comp_pred[j] = (uint8_t)ROUND_POWER_OF_TWO(blended, DIST_PRECISION_BITS);
    }
    comp_pred += width;
    pred += width;
  }
}
425
426 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth sum/SSE accumulation with 64-bit totals. Per-row sums are
// kept in 32 bits before being folded into the 64-bit accumulator.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  uint64_t total_sse = 0;
  int64_t total_sum = 0;

  for (int i = 0; i < h; ++i) {
    int32_t row_sum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      row_sum += diff;
      total_sse += (uint32_t)(diff * diff);
    }
    total_sum += row_sum;
    a += a_stride;
    b += b_stride;
  }

  *sum = total_sum;
  *sse = total_sse;
}
448
// High-bitdepth SSE for arbitrary (e.g. odd) block dimensions; discards the
// sum term computed by highbd_variance64().
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int64_t sum = 0;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  (void)sum;
  return sse;
}
456
// 8-bit depth: the 64-bit totals are narrowed directly to the 32-bit API
// types (no rescaling needed at this depth).
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t total_sse = 0;
  int64_t total_sum = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &total_sse, &total_sum);
  *sum = (int)total_sum;
  *sse = (uint32_t)total_sse;
}
466
// 10-bit depth: normalize the totals back to 8-bit scale with rounding
// (sum by 2 bits, sse by 4) before narrowing to the 32-bit API types.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t total_sse = 0;
  int64_t total_sum = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &total_sse, &total_sum);
  *sum = (int)ROUND_POWER_OF_TWO(total_sum, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(total_sse, 4);
}
476
// 12-bit depth: normalize the totals back to 8-bit scale with rounding
// (sum by 4 bits, sse by 8) before narrowing to the 32-bit API types.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t total_sse = 0;
  int64_t total_sum = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &total_sse, &total_sum);
  *sum = (int)ROUND_POWER_OF_TWO(total_sum, 4);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(total_sse, 8);
}
486
// Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(). The 10- and 12-bit
// variants rescale sse/sum to the 8-bit domain (via the highbd_*_variance
// helpers) before computing sse - sum^2 / (W * H); the result is clamped at
// zero because the independently rounded terms can make the difference
// slightly negative.
#define HIGHBD_VAR(W, H)                                                      \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) {                \
    int sum;                                                                  \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

// Defines aom_highbd_{8,10,12}_get<S>x<S>var_c(): reports the raw
// (sse, sum) pair through out-params instead of returning a variance.
#define HIGHBD_GET_VAR(S)                                                 \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) {          \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                         \
                                                                            \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }

// Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(): returns only the (depth
// normalized) SSE and ignores the sum term.
#define HIGHBD_MSE(W, H)                                                     \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse) {                    \
    int sum;                                                                 \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                             \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
559
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)560 void aom_highbd_var_filter_block2d_bil_first_pass(
561 const uint8_t *src_ptr8, uint16_t *output_ptr,
562 unsigned int src_pixels_per_line, int pixel_step,
563 unsigned int output_height, unsigned int output_width,
564 const uint8_t *filter) {
565 unsigned int i, j;
566 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
567 for (i = 0; i < output_height; ++i) {
568 for (j = 0; j < output_width; ++j) {
569 output_ptr[j] = ROUND_POWER_OF_TWO(
570 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
571 FILTER_BITS);
572
573 ++src_ptr;
574 }
575
576 // Next row...
577 src_ptr += src_pixels_per_line - output_width;
578 output_ptr += output_width;
579 }
580 }
581
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)582 void aom_highbd_var_filter_block2d_bil_second_pass(
583 const uint16_t *src_ptr, uint16_t *output_ptr,
584 unsigned int src_pixels_per_line, unsigned int pixel_step,
585 unsigned int output_height, unsigned int output_width,
586 const uint8_t *filter) {
587 unsigned int i, j;
588
589 for (i = 0; i < output_height; ++i) {
590 for (j = 0; j < output_width; ++j) {
591 output_ptr[j] = ROUND_POWER_OF_TWO(
592 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
593 FILTER_BITS);
594 ++src_ptr;
595 }
596
597 src_ptr += src_pixels_per_line - output_width;
598 output_ptr += output_width;
599 }
600 }
601
// Defines aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(): interpolates
// src with the two-pass 2-tap bilinear filter at the given 1/8-pel offsets,
// then computes the variance of the filtered block against dst.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
647
// Defines aom_highbd_{8,10,12}_sub_pixel_avg_variance<W>x<H>_c() and their
// distance-weighted variants: interpolate src with the two-pass bilinear
// filter, blend with second_pred (plain or distance-weighted average), then
// compute the variance of the blended block against dst.
// Consistency fix: the dist_wtd variants now return via the ##_c variance
// kernels, matching the plain avg variants above, so these _c reference
// functions no longer dispatch to optimized code.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
                                      jcp_param);                             \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }
771
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

// Only 8x8 and 16x16 expose the raw (sse, sum) pair.
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
815
// Rounded average of two high-bitdepth blocks. pred and the comp_pred
// output are packed with stride == width; ref uses ref_stride.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + ref[j], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
833
// Builds a width x height highbd prediction block in comp_pred8 from the
// reference at the sub-pel offsets subpel_x_q3/subpel_y_q3 (1/8-pel units).
// When xd is non-NULL and the reference frame is scaled, delegates to the
// full inter-predictor path instead of the simple separable filters below.
// NOTE(review): comp_pred8/ref8 are CONVERT_TO_BYTEPTR-style aliases of
// uint16_t buffers, as is usual for the highbd code in this file.
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
                                 const struct AV1Common *const cm, int mi_row,
                                 int mi_col, const MV *const mv,
                                 uint8_t *comp_pred8, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd,
                                 int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Scaled reference: build the predictor through the regular
      // inter-prediction machinery (which handles scaling) and return early.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Integer-pel position: a plain row-by-row copy suffices.
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal-only sub-pel filtering (x offset scaled to 1/16-pel).
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
                                 16, NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    // Vertical-only sub-pel filtering.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
                                kernel, 16, width, height, bd);
  } else {
    // Both offsets set: horizontal pass into a temporary buffer tall enough
    // for the vertical filter taps, then a vertical pass into comp_pred8.
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    // Start the horizontal pass (taps/2 - 1) rows above the block so the
    // vertical pass has the context rows it needs.
    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                 intermediate_height, bd);
    aom_highbd_convolve8_vert_c(
        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
        bd);
  }
}
913
// Upsamples the reference into comp_pred8, then averages it with pred8
// in place (rounded mean).  Buffers are highbd (CONVERT_TO_BYTEPTR aliases).
void aom_highbd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  // First write the (possibly sub-pel) upsampled reference into comp_pred8.
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  // Then fold the prediction in: comp_pred = (pred + comp_pred + 1) >> 1.
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + comp_pred[col], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
934
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)935 void aom_highbd_dist_wtd_comp_avg_pred_c(
936 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
937 const uint8_t *ref8, int ref_stride,
938 const DIST_WTD_COMP_PARAMS *jcp_param) {
939 int i, j;
940 const int fwd_offset = jcp_param->fwd_offset;
941 const int bck_offset = jcp_param->bck_offset;
942 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
943 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
944 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
945
946 for (i = 0; i < height; ++i) {
947 for (j = 0; j < width; ++j) {
948 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
949 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
950 comp_pred[j] = (uint16_t)tmp;
951 }
952 comp_pred += width;
953 pred += width;
954 ref += ref_stride;
955 }
956 }
957
// Upsamples the reference into comp_pred8, then applies the distance-weighted
// average with pred8 in place.
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                              height, subpel_x_q3, subpel_y_q3, ref8,
                              ref_stride, bd, subpel_search);

  const int fwd = jcp_param->fwd_offset;
  const int bck = jcp_param->bck_offset;
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int wtd = pred[col] * bck + comp_pred[col] * fwd;
      comp_pred[col] = (uint16_t)ROUND_POWER_OF_TWO(wtd, DIST_PRECISION_BITS);
    }
    comp_pred += width;
    pred += width;
  }
}
983 #endif // CONFIG_AV1_HIGHBITDEPTH
984
// Per-pixel 64-weight blend of pred and ref under `mask`.  When invert_mask
// is set the roles (and therefore the strides) of pred and ref are swapped.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  const uint8_t *fg = invert_mask ? pred : ref;  // weighted by mask[x]
  const uint8_t *bg = invert_mask ? ref : pred;  // weighted by 64 - mask[x]
  const int fg_stride = invert_mask ? width : ref_stride;
  const int bg_stride = invert_mask ? ref_stride : width;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      comp_pred[x] = AOM_BLEND_A64(mask[x], fg[x], bg[x]);
    }
    comp_pred += width;
    fg += fg_stride;
    bg += bg_stride;
    mask += mask_stride;
  }
}
1004
// Masked-compound prediction with optional sub-pel upsampling: when either
// sub-pel offset is non-zero the reference is first upsampled into comp_pred
// (and then re-used as the blend reference with stride == width).
void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask,
                                    int subpel_search) {
  const int needs_upsample = (subpel_x_q3 != 0) || (subpel_y_q3 != 0);
  if (needs_upsample) {
    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
                         subpel_search);
    ref = comp_pred;
    ref_stride = width;
  }
  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
                       mask_stride, invert_mask);
}
1023
// MASK_SUBPIX_VAR(W, H) defines aom_masked_sub_pixel_variance<W>x<H>_c():
// bilinearly filters a (W+1)x(H+1) source region down to WxH (horizontal
// pass into fdata3, vertical pass into temp2), blends the filtered block
// with second_pred under msk into temp3, and returns the variance of temp3
// against ref.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
1043
// Instantiate the masked sub-pixel variance kernels for all block sizes.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
1070
1071 #if CONFIG_AV1_HIGHBITDEPTH
// Highbd masked blend of pred and ref: each output pixel is a 64-weight
// blend, with invert_mask swapping which operand takes the mask weight.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const uint16_t fg = invert_mask ? pred[x] : ref[x];
      const uint16_t bg = invert_mask ? ref[x] : pred[x];
      comp_pred[x] = AOM_BLEND_A64(mask[x], fg, bg);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
1093
// Highbd masked-compound prediction with sub-pel upsampling: first writes
// the upsampled reference into comp_pred8, then blends pred8 against it
// (comp_pred8 is re-used as the blend reference with stride == width).
void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
                            mask, mask_stride, invert_mask);
}
1106
// HIGHBD_MASK_SUBPIX_VAR(W, H) defines the 8-, 10-, and 12-bit masked
// sub-pixel variance kernels for a WxH block: bilinear horizontal pass into
// fdata3, vertical pass into temp2, masked blend with second_pred into
// temp3, then variance of temp3 against ref at the matching bit depth.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }
1173
// Instantiate the highbd masked sub-pixel variance kernels for all sizes.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#if !CONFIG_REALTIME_ONLY
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif
1198 #endif // CONFIG_AV1_HIGHBITDEPTH
1199
1200 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1201 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
1202 const int32_t *wsrc, const int32_t *mask,
1203 int w, int h, unsigned int *sse, int *sum) {
1204 int i, j;
1205
1206 *sse = 0;
1207 *sum = 0;
1208
1209 for (i = 0; i < h; i++) {
1210 for (j = 0; j < w; j++) {
1211 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1212 *sum += diff;
1213 *sse += diff * diff;
1214 }
1215
1216 pre += pre_stride;
1217 wsrc += w;
1218 mask += w;
1219 }
1220 }
1221
// OBMC_VAR(W, H) defines aom_obmc_variance<W>x<H>_c(): variance of the OBMC
// residual, i.e. sse - sum^2 / (W * H), with *sse also returned to the caller.
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }
1230
// OBMC_SUBPIX_VAR(W, H) defines aom_obmc_sub_pixel_variance<W>x<H>_c():
// bilinearly filters pre at the given sub-pel offsets (horizontal pass into
// fdata3, vertical pass into temp2), then computes the OBMC variance of the
// filtered block.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }
1245
// Instantiate the OBMC variance and sub-pixel variance kernels per size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1306
1307 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1308 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1309 const int32_t *wsrc,
1310 const int32_t *mask, int w, int h,
1311 uint64_t *sse, int64_t *sum) {
1312 int i, j;
1313 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1314
1315 *sse = 0;
1316 *sum = 0;
1317
1318 for (i = 0; i < h; i++) {
1319 for (j = 0; j < w; j++) {
1320 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1321 *sum += diff;
1322 *sse += diff * diff;
1323 }
1324
1325 pre += pre_stride;
1326 wsrc += w;
1327 mask += w;
1328 }
1329 }
1330
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1331 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1332 const int32_t *wsrc,
1333 const int32_t *mask, int w, int h,
1334 unsigned int *sse, int *sum) {
1335 int64_t sum64;
1336 uint64_t sse64;
1337 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1338 *sum = (int)sum64;
1339 *sse = (unsigned int)sse64;
1340 }
1341
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1342 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1343 const int32_t *wsrc,
1344 const int32_t *mask, int w, int h,
1345 unsigned int *sse, int *sum) {
1346 int64_t sum64;
1347 uint64_t sse64;
1348 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1349 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1350 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1351 }
1352
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1353 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1354 const int32_t *wsrc,
1355 const int32_t *mask, int w, int h,
1356 unsigned int *sse, int *sum) {
1357 int64_t sum64;
1358 uint64_t sse64;
1359 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1360 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1361 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1362 }
1363
// HIGHBD_OBMC_VAR(W, H) defines the 8-, 10-, and 12-bit OBMC variance
// kernels (sse - sum^2 / (W * H)).  The 10- and 12-bit variants clamp a
// negative intermediate to 0; the 8-bit variant does not (it matches the
// lowbd OBMC_VAR form above).
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
1392
// HIGHBD_OBMC_SUBPIX_VAR(W, H) defines the 8-, 10-, and 12-bit OBMC
// sub-pixel variance kernels: bilinear horizontal pass into fdata3,
// vertical pass into temp2, then the matching OBMC variance on the
// filtered block.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(               \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                                 wsrc, mask, sse);            \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);      \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);      \
  }
1438
// Instantiate the highbd OBMC variance and sub-pixel variance kernels.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1499 #endif // CONFIG_AV1_HIGHBITDEPTH
1500 #endif // !CONFIG_REALTIME_ONLY
1501
// Sum of squared errors between an 8-bit block (dst) and a 16-bit block
// (src) over a w x h region, each with its own stride.  Returns the raw SSE
// (the caller divides by the pixel count to obtain the MSE).
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535, so e * e in plain int
      // would be signed-overflow UB for extreme src values.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1513
// Sum of squared errors between two 16-bit blocks over a w x h region, each
// with its own stride.  Returns the raw SSE (caller computes the mean).
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535, and 65535^2 exceeds
      // INT_MAX, so e * e in plain int would be signed-overflow UB.
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1525