1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/av1_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_ports/mem.h"
21 
22 #include "aom_dsp/aom_filter.h"
23 #include "aom_dsp/blend.h"
24 #include "aom_dsp/variance.h"
25 
26 #include "av1/common/av1_common_int.h"
27 #include "av1/common/filter.h"
28 #include "av1/common/reconinter.h"
29 #include "av1/encoder/reconinter_enc.h"
30 
// Returns the sum of squared differences between two 4x4 blocks of 8-bit
// samples. a_stride and b_stride give the distance, in elements, between the
// starts of consecutive rows of a and b.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sq_err = 0;

  for (int row = 0; row < 4; ++row) {
    const uint8_t *const a_row = a + row * a_stride;
    const uint8_t *const b_row = b + row * b_stride;

    for (int col = 0; col < 4; ++col) {
      const int delta = a_row[col] - b_row[col];
      sq_err += delta * delta;
    }
  }

  return (uint32_t)sq_err;
}
48 
// Returns the sum of the squares of the 256 consecutive int16_t values that
// start at a.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  const int16_t *const end = a + 256;
  unsigned int total = 0;

  for (const int16_t *p = a; p != end; ++p) {
    const int v = *p;
    total += v * v;
  }

  return total;
}
58 
// Walks a w x h block of 8-bit samples and accumulates the per-sample
// difference (a - b) into *sum and its square into *sse. Both outputs are
// reset to zero before accumulation starts. a_stride and b_stride are the
// row pitches of a and b, in elements.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  *sum = 0;
  *sse = 0;

  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
    for (int col = 0; col < w; ++col) {
      const int delta = a[col] - b[col];
      *sum += delta;
      *sse += delta * delta;
    }
  }
}
77 
// Returns the sum of squared differences between the w x h blocks at a and b.
// The difference sum that variance() produces alongside it is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  int unused_sum;
  uint32_t sq_err;

  variance(a, a_stride, b, b_stride, w, h, &sq_err, &unused_sum);
  return sq_err;
}
85 
86 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
87 // or vertical direction to produce the filtered output block. Used to implement
88 // the first-pass of 2-D separable filter.
89 //
90 // Produces int16_t output to retain precision for the next pass. Two filter
91 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
92 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
93 // It defines the offset required to move from one input to the next.
aom_var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)94 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
95                                              unsigned int src_pixels_per_line,
96                                              unsigned int pixel_step,
97                                              unsigned int output_height,
98                                              unsigned int output_width,
99                                              const uint8_t *filter) {
100   unsigned int i, j;
101 
102   for (i = 0; i < output_height; ++i) {
103     for (j = 0; j < output_width; ++j) {
104       b[j] = ROUND_POWER_OF_TWO(
105           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
106 
107       ++a;
108     }
109 
110     a += src_pixels_per_line - output_width;
111     b += output_width;
112   }
113 }
114 
115 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
116 // or vertical direction to produce the filtered output block. Used to implement
117 // the second-pass of 2-D separable filter.
118 //
119 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
120 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
121 // filter is applied horizontally (pixel_step = 1) or vertically
122 // (pixel_step = stride). It defines the offset required to move from one input
123 // to the next. Output is 8-bit.
aom_var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)124 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
125                                               unsigned int src_pixels_per_line,
126                                               unsigned int pixel_step,
127                                               unsigned int output_height,
128                                               unsigned int output_width,
129                                               const uint8_t *filter) {
130   unsigned int i, j;
131 
132   for (i = 0; i < output_height; ++i) {
133     for (j = 0; j < output_width; ++j) {
134       b[j] = ROUND_POWER_OF_TWO(
135           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
136       ++a;
137     }
138 
139     a += src_pixels_per_line - output_width;
140     b += output_width;
141   }
142 }
143 
/* Defines aom_variance<W>x<H>_c(): stores the sum of squared differences
 * between the W x H blocks at a and b in *sse and returns
 * sse - sum^2 / (W * H), where sum is the sum of the differences. The
 * sum^2 term is formed in 64 bits before the division.
 */
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int diff_sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;             \
    return *sse - (uint32_t)(sum_sq / (W * H));                      \
  }
152 
/* Defines aom_sub_pixel_variance<W>x<H>_c(): bilinearly filters the block at
 * a, first horizontally with bilinear_filters_2t[xoffset] and then
 * vertically with bilinear_filters_2t[yoffset], and returns the variance of
 * the filtered block against b (the SSE is stored in *sse).
 */
#define SUBPIX_VAR(W, H)                                                     \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                              \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,              \
      const uint8_t *b, int b_stride, uint32_t *sse) {                       \
    /* H + 1 rows: the vertical pass reads one row below each output row. */ \
    uint16_t horiz[(H + 1) * W];                                             \
    uint8_t filtered[H * W];                                                 \
                                                                             \
    aom_var_filter_block2d_bil_first_pass_c(a, horiz, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);   \
    aom_var_filter_block2d_bil_second_pass_c(horiz, filtered, W, W, H, W,    \
                                             bilinear_filters_2t[yoffset]);  \
                                                                             \
    return aom_variance##W##x##H##_c(filtered, W, b, b_stride, sse);         \
  }
167 
/* Defines the two compound-prediction sub-pixel variance functions for a
 * W x H block:
 *
 *   aom_sub_pixel_avg_variance<W>x<H>_c()
 *   aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_c()
 *
 * Both bilinearly filter the block at a (horizontal taps
 * bilinear_filters_2t[xoffset], then vertical taps
 * bilinear_filters_2t[yoffset]) and combine the filtered block with
 * second_pred: the first through aom_comp_avg_pred, the second through
 * aom_dist_wtd_comp_avg_pred with jcp_param. Each returns the variance of
 * that combination against b and stores the sum of squared differences in
 * *sse.
 *
 * Both functions finish with aom_variance<W>x<H>_c, the variant defined by
 * VAR(W, H) in this file, so the two C implementations use the same
 * variance routine.
 */
#define SUBPIX_AVG_VAR(W, H)                                                   \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred) {                                            \
    /* H + 1 rows: the vertical pass reads one row below each output row. */   \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
                                            bilinear_filters_2t[xoffset]);     \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
                                             bilinear_filters_2t[yoffset]);    \
                                                                               \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
                                                                               \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
  }                                                                            \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
      const uint8_t *b, int b_stride, uint32_t *sse,                           \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
    /* H + 1 rows: the vertical pass reads one row below each output row. */   \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint8_t temp2[H * W];                                                      \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,  \
                                            bilinear_filters_2t[xoffset]);     \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
                                             bilinear_filters_2t[yoffset]);    \
                                                                               \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                               \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
  }
203 
/* Defines aom_get<W>x<H>var_c(): forwards to variance() for a W x H block and
 * hands back both raw accumulators through the output pointers: the sum of
 * squared differences in *sse and the sum of differences in *sum. Unlike the
 * VAR() functions it returns nothing and does not compute
 * sse - sum^2 / (W * H).
 */
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
214 
/* Defines aom_mse<W>x<H>_c(): runs variance() on a W x H block, stores the
 * sum of squared differences in *sse and also returns it. In contrast to the
 * VAR() functions, the sum^2 / (W * H) term is not subtracted; the sum of
 * differences is computed but discarded.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int unused_sum;                                             \
    variance(a, a_stride, b, b_stride, W, H, sse, &unused_sum); \
    return *sse;                                                \
  }
227 
/* Instantiates, for one W x H block size, every variance flavour defined
 * above: the plain variance (VAR), the sub-pixel variance (SUBPIX_VAR) and
 * the two compound-average sub-pixel variances (SUBPIX_AVG_VAR). All forms
 * are therefore available in the same sizes.
 */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
233 
// Emit the C variance functions for every block size that is always built.
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

// Realtime-only builds (CONFIG_REALTIME_ONLY) leave out these 1:4 and 4:1
// aspect-ratio block sizes.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

// Raw sse/sum getters are only generated for the 16x16 and 8x8 sizes.
GET_VAR(16, 16)
GET_VAR(8, 8)

// MSE functions are only generated for these four sizes.
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
271 
// Combines two predictions into comp_pred for a width x height block: each
// output sample is ROUND_POWER_OF_TWO(pred + ref, 1). comp_pred and pred are
// packed with a row pitch of width; ref rows are ref_stride elements apart.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred[col] + ref[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }

    ref += ref_stride;
    pred += width;
    comp_pred += width;
  }
}
286 
// Get pred block from up-sampled reference.
//
// Writes a width x height prediction block to comp_pred, packed with a row
// pitch of width.
//  - If xd is non-NULL and the selected scale factors report scaling
//    (av1_is_scaled), the block is built by av1_enc_build_one_inter_predictor
//    from mv and the function returns early.
//  - Otherwise the block is produced from ref (row pitch ref_stride) with the
//    filter returned by av1_get_filter(subpel_search) and the sub-pixel
//    offsets subpel_x_q3 / subpel_y_q3: a plain copy when both are zero, one
//    1-D convolution when only one is non-zero, and a horizontal followed by
//    a vertical convolution when both are non-zero.
void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                          int mi_row, int mi_col, const MV *const mv,
                          uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride, int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    // Intra block copy uses the identity scale factors; otherwise use the
    // scale factors of reference 0.
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Only plane 0 is handled on this path.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      // Intra block copy predicts from the destination buffer itself.
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  // Unscaled reference (or xd == NULL): filter ref directly.
  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // No sub-pixel offset: copy the block row by row.
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal offset only. The kernel lookup takes the q3 offset doubled
    // (presumably 1/8-pel converted to 1/16-pel units; confirm against
    // av1_get_interp_filter_subpel_kernel).
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
  } else if (!subpel_x_q3) {
    // Vertical offset only.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
  } else {
    // Both offsets: filter horizontally into temp (row pitch MAX_SB_SIZE),
    // then vertically from temp into comp_pred.
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Number of horizontally filtered rows to produce: the rows spanned by
    // the output block plus filter->taps.
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    // The row count must fit in temp.
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    // The horizontal pass starts (taps / 2 - 1) rows above ref; the vertical
    // pass below starts at the matching row offset inside temp.
    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
                          width, intermediate_height);
    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
  }
}
361 
// Builds the up-sampled prediction into comp_pred with aom_upsampled_pred_c()
// and then replaces every sample with
// ROUND_POWER_OF_TWO(comp_pred + pred, 1). Both comp_pred and pred are packed
// with a row pitch of width.
void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                   int mi_row, int mi_col, const MV *const mv,
                                   uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, int subpel_search) {
  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (int row = 0; row < height; ++row, comp_pred += width, pred += width) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = comp_pred[col] + pred[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
  }
}
381 
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)382 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
383                                   int width, int height, const uint8_t *ref,
384                                   int ref_stride,
385                                   const DIST_WTD_COMP_PARAMS *jcp_param) {
386   int i, j;
387   const int fwd_offset = jcp_param->fwd_offset;
388   const int bck_offset = jcp_param->bck_offset;
389 
390   for (i = 0; i < height; ++i) {
391     for (j = 0; j < width; ++j) {
392       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
393       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
394       comp_pred[j] = (uint8_t)tmp;
395     }
396     comp_pred += width;
397     pred += width;
398     ref += ref_stride;
399   }
400 }
401 
// Builds the up-sampled prediction into comp_pred with aom_upsampled_pred_c()
// and then blends it with pred. Each output sample becomes
//   ROUND_POWER_OF_TWO(pred * bck_offset + comp_pred * fwd_offset,
//                      DIST_PRECISION_BITS)
// with the offsets taken from jcp_param. Both comp_pred and pred are packed
// with a row pitch of width.
void aom_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
  const int upsampled_weight = jcp_param->fwd_offset;
  const int pred_weight = jcp_param->bck_offset;

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (int row = 0; row < height; ++row, comp_pred += width, pred += width) {
    for (int col = 0; col < width; ++col) {
      const int weighted =
          pred[col] * pred_weight + comp_pred[col] * upsampled_weight;
      const int rounded = ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
      comp_pred[col] = (uint8_t)rounded;
    }
  }
}
425 
426 #if CONFIG_AV1_HIGHBITDEPTH
// Accumulates, over a w x h block, the sum of sample differences (a - b) into
// *sum and the sum of squared differences into *sse, using 64-bit totals.
// a8 and b8 are converted with CONVERT_TO_SHORTPTR to obtain the 16-bit
// samples; a_stride and b_stride are row pitches in 16-bit elements. The
// difference sum is gathered per row in 32 bits before being added to the
// 64-bit total.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
  uint64_t sse_total = 0;
  int64_t sum_total = 0;

  for (int row = 0; row < h; ++row, a_row += a_stride, b_row += b_stride) {
    int32_t row_sum = 0;

    for (int col = 0; col < w; ++col) {
      const int delta = a_row[col] - b_row[col];
      row_sum += delta;
      sse_total += (uint32_t)(delta * delta);
    }

    sum_total += row_sum;
  }

  *sum = sum_total;
  *sse = sse_total;
}
448 
// Returns the 64-bit sum of squared differences between the w x h blocks at a
// and b, as computed by highbd_variance64(). The difference sum produced
// alongside it is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  int64_t unused_sum;
  uint64_t sq_err;

  highbd_variance64(a, a_stride, b, b_stride, w, h, &sq_err, &unused_sum);
  return sq_err;
}
456 
// Helper behind the aom_highbd_8_* functions: runs highbd_variance64() and
// narrows its 64-bit totals to the 32-bit outputs without any scaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
466 
// Helper behind the aom_highbd_10_* functions: runs highbd_variance64() and
// scales its totals down before narrowing them, the SSE with
// ROUND_POWER_OF_TWO(..., 4) and the difference sum with
// ROUND_POWER_OF_TWO(..., 2).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
476 
// Helper behind the aom_highbd_12_* functions: runs highbd_variance64() and
// scales its totals down before narrowing them, the SSE with
// ROUND_POWER_OF_TWO(..., 8) and the difference sum with
// ROUND_POWER_OF_TWO(..., 4).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
486 
/* Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(). Each stores the SSE
 * produced by the matching highbd_{8,10,12}_variance() helper in *sse and
 * returns sse - sum^2 / (W * H). The 10- and 12-bit versions evaluate the
 * subtraction in 64 bits and return 0 when the result is negative; the 8-bit
 * version subtracts in uint32_t arithmetic.
 */
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int diff_sum;                                                              \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);         \
    const int64_t mean_term = ((int64_t)diff_sum * diff_sum) / (W * H);        \
    return *sse - (uint32_t)mean_term;                                         \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t mean_term = ((int64_t)diff_sum * diff_sum) / (W * H);        \
    const int64_t var = (int64_t)(*sse) - mean_term;                           \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t mean_term = ((int64_t)diff_sum * diff_sum) / (W * H);        \
    const int64_t var = (int64_t)(*sse) - mean_term;                           \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }
515 
/* Defines aom_highbd_{8,10,12}_get<S>x<S>var_c() for a square S x S block.
 * Each forwards to the matching highbd_{8,10,12}_variance() helper, which
 * writes the sum of squared differences to *sse and the sum of differences
 * to *sum; nothing is returned.
 */
#define HIGHBD_GET_VAR(S)                                                    \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
                                        const uint8_t *ref, int ref_stride,  \
                                        uint32_t *sse, int *sum) {           \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
  }                                                                          \
                                                                             \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }
534 
/* Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(). Each runs the matching
 * highbd_{8,10,12}_variance() helper on a W x H block, stores the resulting
 * sum of squared differences in *sse and also returns it. The sum of
 * differences is computed but discarded.
 */
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int unused_sum;                                                           \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse,            \
                      &unused_sum);                                           \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }
559 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)560 void aom_highbd_var_filter_block2d_bil_first_pass(
561     const uint8_t *src_ptr8, uint16_t *output_ptr,
562     unsigned int src_pixels_per_line, int pixel_step,
563     unsigned int output_height, unsigned int output_width,
564     const uint8_t *filter) {
565   unsigned int i, j;
566   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
567   for (i = 0; i < output_height; ++i) {
568     for (j = 0; j < output_width; ++j) {
569       output_ptr[j] = ROUND_POWER_OF_TWO(
570           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
571           FILTER_BITS);
572 
573       ++src_ptr;
574     }
575 
576     // Next row...
577     src_ptr += src_pixels_per_line - output_width;
578     output_ptr += output_width;
579   }
580 }
581 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)582 void aom_highbd_var_filter_block2d_bil_second_pass(
583     const uint16_t *src_ptr, uint16_t *output_ptr,
584     unsigned int src_pixels_per_line, unsigned int pixel_step,
585     unsigned int output_height, unsigned int output_width,
586     const uint8_t *filter) {
587   unsigned int i, j;
588 
589   for (i = 0; i < output_height; ++i) {
590     for (j = 0; j < output_width; ++j) {
591       output_ptr[j] = ROUND_POWER_OF_TWO(
592           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
593           FILTER_BITS);
594       ++src_ptr;
595     }
596 
597     src_ptr += src_pixels_per_line - output_width;
598     output_ptr += output_width;
599   }
600 }
601 
/*
 * Defines aom_highbd_<BD>_sub_pixel_variance<W>x<H>_c() for one bit depth.
 *
 * The generated function runs the two-pass bilinear filter over `src`: the
 * first pass uses bilinear_filters_2t[xoffset] and fills an (H + 1) x W
 * intermediate buffer, the second pass uses bilinear_filters_2t[yoffset] and
 * reduces it to H x W. The result is the value returned by
 * aom_highbd_<BD>_variance<W>x<H>_c() for the filtered block against `dst`,
 * with `sse` forwarded to that call.
 */
#define HIGHBD_SUBPIX_VAR_BD(BD, W, H)                                      \
  uint32_t aom_highbd_##BD##_sub_pixel_variance##W##x##H##_c(               \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                  \
    uint16_t pass1[(H + 1) * W];                                            \
    uint16_t pass2[H * W];                                                  \
                                                                            \
    aom_highbd_var_filter_block2d_bil_first_pass(                           \
        src, pass1, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                          \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                            \
    return aom_highbd_##BD##_variance##W##x##H##_c(                         \
        CONVERT_TO_BYTEPTR(pass2), W, dst, dst_stride, sse);                \
  }

/* Sub-pixel variance for a W x H block at bit depths 8, 10 and 12. */
#define HIGHBD_SUBPIX_VAR(W, H)  \
  HIGHBD_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_SUBPIX_VAR_BD(12, W, H)
647 
/*
 * Defines aom_highbd_<BD>_sub_pixel_avg_variance<W>x<H>_c() for one bit
 * depth.
 *
 * `src` goes through the two-pass bilinear filter (first pass with
 * bilinear_filters_2t[xoffset], second pass with
 * bilinear_filters_2t[yoffset]). The filtered block is then combined with
 * `second_pred` by aom_highbd_comp_avg_pred_c(), and the value returned is
 * aom_highbd_<BD>_variance<W>x<H>_c() of that combined block against `dst`.
 */
#define HIGHBD_SUBPIX_AVG_VAR_BD(BD, W, H)                                  \
  uint32_t aom_highbd_##BD##_sub_pixel_avg_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                         \
    uint16_t pass1[(H + 1) * W];                                            \
    uint16_t pass2[H * W];                                                  \
    DECLARE_ALIGNED(16, uint16_t, avg[H * W]);                              \
                                                                            \
    aom_highbd_var_filter_block2d_bil_first_pass(                           \
        src, pass1, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                          \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                            \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(avg), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(pass2), W);               \
                                                                            \
    return aom_highbd_##BD##_variance##W##x##H##_c(                         \
        CONVERT_TO_BYTEPTR(avg), W, dst, dst_stride, sse);                  \
  }

/*
 * Defines aom_highbd_<BD>_dist_wtd_sub_pixel_avg_variance<W>x<H>_c() for one
 * bit depth. Same filtering as above, but the filtered block is combined
 * with `second_pred` using the weights carried by `jcp_param`.
 *
 * NOTE(review): unlike the plain averaging variant, this one calls
 * aom_highbd_dist_wtd_comp_avg_pred() and aom_highbd_<BD>_variance<W>x<H>()
 * without the `_c` suffix. That is how the code was written and it is kept
 * unchanged here.
 */
#define HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(BD, W, H)                         \
  uint32_t aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                    \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {  \
    uint16_t pass1[(H + 1) * W];                                            \
    uint16_t pass2[H * W];                                                  \
    DECLARE_ALIGNED(16, uint16_t, avg[H * W]);                              \
                                                                            \
    aom_highbd_var_filter_block2d_bil_first_pass(                           \
        src, pass1, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                          \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                            \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(avg), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(pass2), W,   \
                                      jcp_param);                           \
                                                                            \
    return aom_highbd_##BD##_variance##W##x##H(                             \
        CONVERT_TO_BYTEPTR(avg), W, dst, dst_stride, sse);                  \
  }

/*
 * Averaging and distance-weighted averaging sub-pixel variance for a W x H
 * block at bit depths 8, 10 and 12.
 */
#define HIGHBD_SUBPIX_AVG_VAR(W, H)           \
  HIGHBD_SUBPIX_AVG_VAR_BD(8, W, H)           \
  HIGHBD_SUBPIX_AVG_VAR_BD(10, W, H)          \
  HIGHBD_SUBPIX_AVG_VAR_BD(12, W, H)          \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(8, W, H)  \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(10, W, H) \
  HIGHBD_DIST_WTD_SUBPIX_AVG_VAR_BD(12, W, H)
771 
/*
 * All three forms of the variance are available in the same sizes.
 *
 * For one W x H block size this expands, in order, to HIGHBD_VAR (defined
 * outside this section; presumably the plain variance functions),
 * HIGHBD_SUBPIX_VAR (sub-pixel variance) and HIGHBD_SUBPIX_AVG_VAR (sub-pixel
 * variance combined with a second prediction).
 */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
777 
// Instantiate the high-bitdepth variance functions for every block size that
// is always built.
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

// Realtime mode doesn't use the 1:4 / 4:1 rectangular blocks, so these sizes
// are only built when the encoder is not realtime-only.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

// HIGHBD_GET_VAR and HIGHBD_MSE are defined outside this section; they are
// instantiated here for the listed sizes only.
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
815 
// Combines two high-bitdepth predictions sample by sample:
//   comp_pred[x] = ROUND_POWER_OF_TWO(pred[x] + ref[x], 1)
//
// All three 8-bit pointer arguments are converted to 16-bit sample pointers
// with CONVERT_TO_SHORTPTR. comp_pred8 and pred8 are walked with a row stride
// of `width`; ref8 is walked with `ref_stride`.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int sum = pred_row[x] + ref_row[x];
      out_row[x] = ROUND_POWER_OF_TWO(sum, 1);
    }
    out_row += width;
    pred_row += width;
    ref_row += ref_stride;
  }
}
833 
// Writes a width x height high-bitdepth prediction block into comp_pred8
// (row stride == width), taken from ref8 at a sub-pixel offset.
//
//   xd, cm, mi_row, mi_col, mv
//       Only used when xd is non-NULL and the reference is scaled; in that
//       case the block is built by av1_enc_build_one_inter_predictor() and
//       the remaining arguments other than comp_pred8/width/height are not
//       used.
//   subpel_x_q3, subpel_y_q3
//       Sub-pixel offsets in 1/8 sample units. Zero in both means a straight
//       copy; zero in one direction means a single 1-D convolution.
//   ref8, ref_stride
//       Source samples and their row stride.
//   bd  Bit depth handed to the convolve functions.
//   subpel_search
//       Selects the interpolation filter through av1_get_filter().
void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
                                 const struct AV1Common *const cm, int mi_row,
                                 int mi_col, const MV *const mv,
                                 uint8_t *comp_pred8, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd,
                                 int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      // Scaled reference: defer to the generic inter predictor for plane 0.
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    // Full-pel position: plain row-by-row copy.
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    // Horizontal offset only.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
                                 16, NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    // Vertical offset only.
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
                                kernel, 16, width, height, bd);
  } else {
    // Both offsets: horizontal pass into `temp`, then vertical pass from it.
    // `temp` is laid out as rows of MAX_SB_SIZE samples.
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    // Rows the vertical filter needs: the block height plus the filter's
    // extra taps.
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    // The bound is derived from the buffer itself so that the check cannot
    // drift away from the number of rows `temp` really holds.
    assert(intermediate_height <=
           (int)(sizeof(temp) / sizeof(temp[0])) / MAX_SB_SIZE);
    // Start (taps / 2 - 1) rows above the block so the vertical filter has
    // its leading taps available.
    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                 intermediate_height, bd);
    aom_highbd_convolve8_vert_c(
        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
        bd);
  }
}
913 
// Builds the sub-pixel prediction from ref8 into comp_pred8 with
// aom_highbd_upsampled_pred(), then replaces every sample of comp_pred8 by
// ROUND_POWER_OF_TWO(pred + comp_pred, 1), where pred comes from pred8.
// Both comp_pred8 and pred8 are walked with a row stride of `width`.
void aom_highbd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  // Step 1: interpolated reference goes straight into the output buffer.
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  // Step 2: combine it in place with the second prediction.
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      out_row[x] = ROUND_POWER_OF_TWO(pred_row[x] + out_row[x], 1);
    }
    out_row += width;
    pred_row += width;
  }
}
934 
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)935 void aom_highbd_dist_wtd_comp_avg_pred_c(
936     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
937     const uint8_t *ref8, int ref_stride,
938     const DIST_WTD_COMP_PARAMS *jcp_param) {
939   int i, j;
940   const int fwd_offset = jcp_param->fwd_offset;
941   const int bck_offset = jcp_param->bck_offset;
942   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
943   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
944   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
945 
946   for (i = 0; i < height; ++i) {
947     for (j = 0; j < width; ++j) {
948       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
949       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
950       comp_pred[j] = (uint16_t)tmp;
951     }
952     comp_pred += width;
953     pred += width;
954     ref += ref_stride;
955   }
956 }
957 
// Builds the sub-pixel prediction from ref8 into comp_pred8 with
// aom_highbd_upsampled_pred_c(), then replaces every sample of comp_pred8 by
//   ROUND_POWER_OF_TWO(pred * jcp_param->bck_offset +
//                      comp_pred * jcp_param->fwd_offset,
//                      DIST_PRECISION_BITS)
// where pred comes from pred8. Both buffers use a row stride of `width`.
void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  const int upsampled_weight = jcp_param->fwd_offset;
  const int pred_weight = jcp_param->bck_offset;

  // Step 1: interpolated reference goes straight into the output buffer.
  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                              height, subpel_x_q3, subpel_y_q3, ref8,
                              ref_stride, bd, subpel_search);

  // Step 2: weighted in-place combination with the second prediction.
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int weighted =
          pred_row[x] * pred_weight + out_row[x] * upsampled_weight;
      out_row[x] =
          (uint16_t)ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
    }
    out_row += width;
    pred_row += width;
  }
}
983 #endif  // CONFIG_AV1_HIGHBITDEPTH
984 
// Blends `ref` and `pred` into comp_pred under control of `mask`:
//   invert_mask == 0: comp_pred[x] = AOM_BLEND_A64(mask[x], ref[x], pred[x])
//   invert_mask != 0: comp_pred[x] = AOM_BLEND_A64(mask[x], pred[x], ref[x])
//
// comp_pred and pred are walked with a row stride of `width`, ref with
// `ref_stride` and mask with `mask_stride`.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  // Pick which input is the first and which the second blend operand.
  const uint8_t *first_row = ref;
  const uint8_t *second_row = pred;
  int first_stride = ref_stride;
  int second_stride = width;
  if (invert_mask) {
    first_row = pred;
    second_row = ref;
    first_stride = width;
    second_stride = ref_stride;
  }

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      comp_pred[x] = AOM_BLEND_A64(mask[x], first_row[x], second_row[x]);
    }
    comp_pred += width;
    first_row += first_stride;
    second_row += second_stride;
    mask += mask_stride;
  }
}
1004 
// Masked blend of `pred` with the (optionally sub-pixel interpolated)
// reference.
//
// With both sub-pixel offsets zero, `ref` is blended directly. Otherwise the
// reference is first interpolated into comp_pred by aom_upsampled_pred_c()
// and comp_pred (row stride `width`) is then used as the reference input of
// the blend, which runs in place.
void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask,
                                    int subpel_search) {
  const int has_subpel_offset = subpel_x_q3 != 0 || subpel_y_q3 != 0;

  if (!has_subpel_offset) {
    aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
                         mask_stride, invert_mask);
    return;
  }

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);
  aom_comp_mask_pred_c(comp_pred, pred, width, height, comp_pred, width, mask,
                       mask_stride, invert_mask);
}
1023 
/*
 * Defines aom_masked_sub_pixel_variance<W>x<H>_c().
 *
 * `src` goes through the two-pass bilinear filter (first pass with
 * bilinear_filters_2t[xoffset] into an (H + 1) x W buffer, second pass with
 * bilinear_filters_2t[yoffset] into an H x W buffer). The filtered block is
 * blended with `second_pred` by aom_comp_mask_pred_c() using `msk`,
 * `msk_stride` and `invert_mask`, and the value returned is
 * aom_variance<W>x<H>_c() of the blended block against `ref`.
 */
#define MASK_SUBPIX_VAR(W, H)                                               \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,       \
      const uint8_t *msk, int msk_stride, int invert_mask,                  \
      unsigned int *sse) {                                                  \
    uint16_t pass1[(H + 1) * W];                                            \
    uint8_t pass2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint8_t, blended[H * W]);                           \
                                                                            \
    aom_var_filter_block2d_bil_first_pass_c(                                \
        src, pass1, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(                               \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    aom_comp_mask_pred_c(blended, second_pred, W, H, pass2, W, msk,         \
                         msk_stride, invert_mask);                          \
                                                                            \
    return aom_variance##W##x##H##_c(blended, W, ref, ref_stride, sse);     \
  }
1043 
// Instantiate the masked sub-pixel variance function for every block size
// that is always built.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use the 1:4 / 4:1 rectangular blocks, so these sizes
// are only built when the encoder is not realtime-only.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
1070 
1071 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth masked blend of `ref` and `pred` into comp_pred:
//   invert_mask == 0: comp_pred[x] = AOM_BLEND_A64(mask[x], ref[x], pred[x])
//   invert_mask != 0: comp_pred[x] = AOM_BLEND_A64(mask[x], pred[x], ref[x])
//
// The three 8-bit sample pointers are converted with CONVERT_TO_SHORTPTR.
// comp_pred8 and pred8 are walked with a row stride of `width`, ref8 with
// `ref_stride` and mask with `mask_stride`.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  const uint16_t *const pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  // Decide once which input is the first and which the second blend operand.
  const uint16_t *first_row = invert_mask ? pred : ref;
  const uint16_t *second_row = invert_mask ? ref : pred;
  const int first_stride = invert_mask ? width : ref_stride;
  const int second_stride = invert_mask ? ref_stride : width;

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      out_row[x] = AOM_BLEND_A64(mask[x], first_row[x], second_row[x]);
    }
    out_row += width;
    first_row += first_stride;
    second_row += second_stride;
    mask += mask_stride;
  }
}
1093 
// High-bitdepth masked blend of `pred8` with the interpolated reference.
//
// The reference is always passed through aom_highbd_upsampled_pred(), which
// writes into comp_pred8. That buffer (row stride `width`) is then both the
// reference input and the output of aom_highbd_comp_mask_pred().
void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);

  // The freshly written block doubles as the blend's reference input.
  const uint8_t *const upsampled8 = comp_pred8;
  const int upsampled_stride = width;
  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, upsampled8,
                            upsampled_stride, mask, mask_stride, invert_mask);
}
1106 
1107 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
1108   unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
1109       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
1110       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
1111       const uint8_t *msk, int msk_stride, int invert_mask,                     \
1112       unsigned int *sse) {                                                     \
1113     uint16_t fdata3[(H + 1) * W];                                              \
1114     uint16_t temp2[H * W];                                                     \
1115     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
1116                                                                                \
1117     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1118         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1119     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1120         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1121                                                                                \
1122     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
1123                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
1124                                 invert_mask);                                  \
1125                                                                                \
1126     return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
1127                                               ref, ref_stride, sse);           \
1128   }                                                                            \
1129                                                                                \
1130   unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
1131       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
1132       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
1133       const uint8_t *msk, int msk_stride, int invert_mask,                     \
1134       unsigned int *sse) {                                                     \
1135     uint16_t fdata3[(H + 1) * W];                                              \
1136     uint16_t temp2[H * W];                                                     \
1137     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
1138                                                                                \
1139     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1140         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1141     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1142         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1143                                                                                \
1144     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
1145                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
1146                                 invert_mask);                                  \
1147                                                                                \
1148     return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
1149                                                ref, ref_stride, sse);          \
1150   }                                                                            \
1151                                                                                \
1152   unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
1153       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
1154       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
1155       const uint8_t *msk, int msk_stride, int invert_mask,                     \
1156       unsigned int *sse) {                                                     \
1157     uint16_t fdata3[(H + 1) * W];                                              \
1158     uint16_t temp2[H * W];                                                     \
1159     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
1160                                                                                \
1161     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1162         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1163     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1164         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1165                                                                                \
1166     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
1167                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
1168                                 invert_mask);                                  \
1169                                                                                \
1170     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
1171                                                ref, ref_stride, sse);          \
1172   }
1173 
1174 HIGHBD_MASK_SUBPIX_VAR(4, 4)
1175 HIGHBD_MASK_SUBPIX_VAR(4, 8)
1176 HIGHBD_MASK_SUBPIX_VAR(8, 4)
1177 HIGHBD_MASK_SUBPIX_VAR(8, 8)
1178 HIGHBD_MASK_SUBPIX_VAR(8, 16)
1179 HIGHBD_MASK_SUBPIX_VAR(16, 8)
1180 HIGHBD_MASK_SUBPIX_VAR(16, 16)
1181 HIGHBD_MASK_SUBPIX_VAR(16, 32)
1182 HIGHBD_MASK_SUBPIX_VAR(32, 16)
1183 HIGHBD_MASK_SUBPIX_VAR(32, 32)
1184 HIGHBD_MASK_SUBPIX_VAR(32, 64)
1185 HIGHBD_MASK_SUBPIX_VAR(64, 32)
1186 HIGHBD_MASK_SUBPIX_VAR(64, 64)
1187 HIGHBD_MASK_SUBPIX_VAR(64, 128)
1188 HIGHBD_MASK_SUBPIX_VAR(128, 64)
1189 HIGHBD_MASK_SUBPIX_VAR(128, 128)
1190 #if !CONFIG_REALTIME_ONLY
1191 HIGHBD_MASK_SUBPIX_VAR(4, 16)
1192 HIGHBD_MASK_SUBPIX_VAR(16, 4)
1193 HIGHBD_MASK_SUBPIX_VAR(8, 32)
1194 HIGHBD_MASK_SUBPIX_VAR(32, 8)
1195 HIGHBD_MASK_SUBPIX_VAR(16, 64)
1196 HIGHBD_MASK_SUBPIX_VAR(64, 16)
1197 #endif
1198 #endif  // CONFIG_AV1_HIGHBITDEPTH
1199 
1200 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1201 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
1202                                  const int32_t *wsrc, const int32_t *mask,
1203                                  int w, int h, unsigned int *sse, int *sum) {
1204   int i, j;
1205 
1206   *sse = 0;
1207   *sum = 0;
1208 
1209   for (i = 0; i < h; i++) {
1210     for (j = 0; j < w; j++) {
1211       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1212       *sum += diff;
1213       *sse += diff * diff;
1214     }
1215 
1216     pre += pre_stride;
1217     wsrc += w;
1218     mask += w;
1219   }
1220 }
1221 
// Defines aom_obmc_variance<W>x<H>_c(): runs obmc_variance() on a W x H block,
// stores the sum of squared residuals in *sse and returns
//   *sse - sum^2 / (W * H),
// where the square of the residual sum is formed in 64 bits before dividing.
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    const int64_t sum_sq = (int64_t)sum * sum;                    \
    return *sse - (unsigned int)(sum_sq / (W * H));               \
  }
1230 
// Defines aom_obmc_sub_pixel_variance<W>x<H>_c(): filters `pre` with the
// two-tap bilinear kernels selected by xoffset and yoffset, then evaluates
// aom_obmc_variance<W>x<H>_c() on the filtered W x H block.
//
// The first pass writes (H + 1) rows of W samples into `pass1`; the second
// pass reduces them to the H x W block `pass2`, which is passed on with a
// stride of W.
#define OBMC_SUBPIX_VAR(W, H)                                                  \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t pass1[(H + 1) * W];                                               \
    uint8_t pass2[H * W];                                                      \
                                                                               \
    aom_var_filter_block2d_bil_first_pass_c(                                   \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(                                  \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);               \
                                                                               \
    return aom_obmc_variance##W##x##H##_c(pass2, W, wsrc, mask, sse);          \
  }
1245 
1246 OBMC_VAR(4, 4)
1247 OBMC_SUBPIX_VAR(4, 4)
1248 
1249 OBMC_VAR(4, 8)
1250 OBMC_SUBPIX_VAR(4, 8)
1251 
1252 OBMC_VAR(8, 4)
1253 OBMC_SUBPIX_VAR(8, 4)
1254 
1255 OBMC_VAR(8, 8)
1256 OBMC_SUBPIX_VAR(8, 8)
1257 
1258 OBMC_VAR(8, 16)
1259 OBMC_SUBPIX_VAR(8, 16)
1260 
1261 OBMC_VAR(16, 8)
1262 OBMC_SUBPIX_VAR(16, 8)
1263 
1264 OBMC_VAR(16, 16)
1265 OBMC_SUBPIX_VAR(16, 16)
1266 
1267 OBMC_VAR(16, 32)
1268 OBMC_SUBPIX_VAR(16, 32)
1269 
1270 OBMC_VAR(32, 16)
1271 OBMC_SUBPIX_VAR(32, 16)
1272 
1273 OBMC_VAR(32, 32)
1274 OBMC_SUBPIX_VAR(32, 32)
1275 
1276 OBMC_VAR(32, 64)
1277 OBMC_SUBPIX_VAR(32, 64)
1278 
1279 OBMC_VAR(64, 32)
1280 OBMC_SUBPIX_VAR(64, 32)
1281 
1282 OBMC_VAR(64, 64)
1283 OBMC_SUBPIX_VAR(64, 64)
1284 
1285 OBMC_VAR(64, 128)
1286 OBMC_SUBPIX_VAR(64, 128)
1287 
1288 OBMC_VAR(128, 64)
1289 OBMC_SUBPIX_VAR(128, 64)
1290 
1291 OBMC_VAR(128, 128)
1292 OBMC_SUBPIX_VAR(128, 128)
1293 
1294 OBMC_VAR(4, 16)
1295 OBMC_SUBPIX_VAR(4, 16)
1296 OBMC_VAR(16, 4)
1297 OBMC_SUBPIX_VAR(16, 4)
1298 OBMC_VAR(8, 32)
1299 OBMC_SUBPIX_VAR(8, 32)
1300 OBMC_VAR(32, 8)
1301 OBMC_SUBPIX_VAR(32, 8)
1302 OBMC_VAR(16, 64)
1303 OBMC_SUBPIX_VAR(16, 64)
1304 OBMC_VAR(64, 16)
1305 OBMC_SUBPIX_VAR(64, 16)
1306 
1307 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1308 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1309                                           const int32_t *wsrc,
1310                                           const int32_t *mask, int w, int h,
1311                                           uint64_t *sse, int64_t *sum) {
1312   int i, j;
1313   uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1314 
1315   *sse = 0;
1316   *sum = 0;
1317 
1318   for (i = 0; i < h; i++) {
1319     for (j = 0; j < w; j++) {
1320       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1321       *sum += diff;
1322       *sse += diff * diff;
1323     }
1324 
1325     pre += pre_stride;
1326     wsrc += w;
1327     mask += w;
1328   }
1329 }
1330 
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1331 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1332                                         const int32_t *wsrc,
1333                                         const int32_t *mask, int w, int h,
1334                                         unsigned int *sse, int *sum) {
1335   int64_t sum64;
1336   uint64_t sse64;
1337   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1338   *sum = (int)sum64;
1339   *sse = (unsigned int)sse64;
1340 }
1341 
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1342 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1343                                            const int32_t *wsrc,
1344                                            const int32_t *mask, int w, int h,
1345                                            unsigned int *sse, int *sum) {
1346   int64_t sum64;
1347   uint64_t sse64;
1348   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1349   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1350   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1351 }
1352 
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1353 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1354                                            const int32_t *wsrc,
1355                                            const int32_t *mask, int w, int h,
1356                                            unsigned int *sse, int *sum) {
1357   int64_t sum64;
1358   uint64_t sse64;
1359   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1360   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1361   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1362 }
1363 
// Defines the three high-bitdepth OBMC variance functions for a W x H block:
//   aom_highbd_obmc_variance<W>x<H>_c     (uses highbd_obmc_variance)
//   aom_highbd_10_obmc_variance<W>x<H>_c  (uses highbd_10_obmc_variance)
//   aom_highbd_12_obmc_variance<W>x<H>_c  (uses highbd_12_obmc_variance)
// Each stores the sum of squared residuals in *sse and returns
// *sse - sum^2 / (W * H). The 10- and 12-bit variants evaluate that
// difference in 64 bits and return 0 when it is negative; the first variant
// returns the unsigned difference as is.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    const int64_t sum_sq = (int64_t)sum * sum;                             \
    return *sse - (unsigned int)(sum_sq / (W * H));                        \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    const int64_t mean_sq = ((int64_t)sum * sum) / (W * H);                \
    const int64_t var = (int64_t)(*sse) - mean_sq;                         \
    return (var < 0) ? 0 : (uint32_t)var;                                  \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    const int64_t mean_sq = ((int64_t)sum * sum) / (W * H);                \
    const int64_t var = (int64_t)(*sse) - mean_sq;                         \
    return (var < 0) ? 0 : (uint32_t)var;                                  \
  }
1392 
// Defines the three high-bitdepth OBMC sub-pixel variance functions for a
// W x H block. Each one filters `pre` with the two-tap bilinear kernels
// selected by xoffset and yoffset -- (H + 1) x W samples into `pass1`, then
// H x W samples into `pass2` -- and forwards `pass2` (wrapped with
// CONVERT_TO_BYTEPTR, stride W) to the matching
// aom_highbd[_10|_12]_obmc_variance<W>x<H>_c() function.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(                \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t pass1[(H + 1) * W];                                               \
    uint16_t pass2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);               \
                                                                               \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(pass2), W, \
                                                 wsrc, mask, sse);             \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t pass1[(H + 1) * W];                                               \
    uint16_t pass2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);               \
                                                                               \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(pass2), \
                                                    W, wsrc, mask, sse);       \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t pass1[(H + 1) * W];                                               \
    uint16_t pass2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);               \
                                                                               \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(pass2), \
                                                    W, wsrc, mask, sse);       \
  }
1438 
1439 HIGHBD_OBMC_VAR(4, 4)
1440 HIGHBD_OBMC_SUBPIX_VAR(4, 4)
1441 
1442 HIGHBD_OBMC_VAR(4, 8)
1443 HIGHBD_OBMC_SUBPIX_VAR(4, 8)
1444 
1445 HIGHBD_OBMC_VAR(8, 4)
1446 HIGHBD_OBMC_SUBPIX_VAR(8, 4)
1447 
1448 HIGHBD_OBMC_VAR(8, 8)
1449 HIGHBD_OBMC_SUBPIX_VAR(8, 8)
1450 
1451 HIGHBD_OBMC_VAR(8, 16)
1452 HIGHBD_OBMC_SUBPIX_VAR(8, 16)
1453 
1454 HIGHBD_OBMC_VAR(16, 8)
1455 HIGHBD_OBMC_SUBPIX_VAR(16, 8)
1456 
1457 HIGHBD_OBMC_VAR(16, 16)
1458 HIGHBD_OBMC_SUBPIX_VAR(16, 16)
1459 
1460 HIGHBD_OBMC_VAR(16, 32)
1461 HIGHBD_OBMC_SUBPIX_VAR(16, 32)
1462 
1463 HIGHBD_OBMC_VAR(32, 16)
1464 HIGHBD_OBMC_SUBPIX_VAR(32, 16)
1465 
1466 HIGHBD_OBMC_VAR(32, 32)
1467 HIGHBD_OBMC_SUBPIX_VAR(32, 32)
1468 
1469 HIGHBD_OBMC_VAR(32, 64)
1470 HIGHBD_OBMC_SUBPIX_VAR(32, 64)
1471 
1472 HIGHBD_OBMC_VAR(64, 32)
1473 HIGHBD_OBMC_SUBPIX_VAR(64, 32)
1474 
1475 HIGHBD_OBMC_VAR(64, 64)
1476 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
1477 
1478 HIGHBD_OBMC_VAR(64, 128)
1479 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
1480 
1481 HIGHBD_OBMC_VAR(128, 64)
1482 HIGHBD_OBMC_SUBPIX_VAR(128, 64)
1483 
1484 HIGHBD_OBMC_VAR(128, 128)
1485 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
1486 
1487 HIGHBD_OBMC_VAR(4, 16)
1488 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
1489 HIGHBD_OBMC_VAR(16, 4)
1490 HIGHBD_OBMC_SUBPIX_VAR(16, 4)
1491 HIGHBD_OBMC_VAR(8, 32)
1492 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
1493 HIGHBD_OBMC_VAR(32, 8)
1494 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
1495 HIGHBD_OBMC_VAR(16, 64)
1496 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
1497 HIGHBD_OBMC_VAR(64, 16)
1498 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1499 #endif  // CONFIG_AV1_HIGHBITDEPTH
1500 #endif  // !CONFIG_REALTIME_ONLY
1501 
// Returns the sum of squared differences between the 8-bit block `dst` and
// the 16-bit block `src`, both w x h samples.
//
//   dst, dstride  8-bit samples and their row stride (in samples).
//   src, sstride  16-bit samples and their row stride (in samples).
//   w, h          block width and height; a non-positive size yields 0.
//
// Despite the "mse" in the name, the result is the raw sum; it is not divided
// by the number of samples.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    const uint8_t *const dst_row = dst + i * dstride;
    const uint16_t *const src_row = src + i * sstride;
    for (int j = 0; j < w; j++) {
      // The difference can reach -65535, whose square does not fit in a
      // 32-bit int; square it in 64 bits to avoid signed overflow.
      const int64_t e = (int64_t)dst_row[j] - src_row[j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1513 
// Returns the sum of squared differences between two 16-bit blocks `dst` and
// `src`, both w x h samples.
//
//   dst, dstride  16-bit samples and their row stride (in samples).
//   src, sstride  16-bit samples and their row stride (in samples).
//   w, h          block width and height; a non-positive size yields 0.
//
// Despite the "mse" in the name, the result is the raw sum; it is not divided
// by the number of samples.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    const uint16_t *const dst_row = dst + i * dstride;
    const uint16_t *const src_row = src + i * sstride;
    for (int j = 0; j < w; j++) {
      // |difference| can reach 65535, whose square does not fit in a 32-bit
      // int; square it in 64 bits to avoid signed overflow.
      const int64_t e = (int64_t)dst_row[j] - src_row[j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1525