1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <stdlib.h>
12 #include <string.h>
13 #include <assert.h>
14 
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17 
18 #include "aom_ports/mem.h"
19 #include "aom/aom_integer.h"
20 
21 #include "aom_dsp/variance.h"
22 #include "aom_dsp/aom_filter.h"
23 #include "aom_dsp/blend.h"
24 
25 #include "./av1_rtcd.h"
26 #include "av1/common/filter.h"
27 
// Computes the sum of squared differences (SSE) between two 4x4 blocks.
// No sub-pixel interpolation is applied; rows advance by the given strides.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sse = 0;
  int row = 4;

  while (row--) {
    int col;
    for (col = 0; col < 4; ++col) {
      const int d = a[col] - b[col];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }

  return sse;
}
45 
// Returns the sum of squares of a 16x16 macroblock's 256 residual values.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int total = 0;
  unsigned int idx;

  for (idx = 0; idx < 256; ++idx) total += a[idx] * a[idx];

  return total;
}
55 
// Half-pixel horizontal variance for a 16x16 block: equivalent to the
// sub-pixel variance with xoffset = 4 (half pel) and yoffset = 0.
uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse);
}
61 
// Half-pixel vertical variance for a 16x16 block: equivalent to the
// sub-pixel variance with xoffset = 0 and yoffset = 4 (half pel).
uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse);
}
67 
// Half-pixel diagonal variance for a 16x16 block: equivalent to the
// sub-pixel variance with both offsets at 4 (half pel in each direction).
uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
                                           const uint8_t *b, int b_stride,
                                           uint32_t *sse) {
  return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse);
}
73 
// Accumulates the sum of differences (*sum) and sum of squared differences
// (*sse) between two w x h blocks of 8-bit pixels.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int row, col;
  uint32_t sse_acc = 0;
  int sum_acc = 0;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      sum_acc += d;
      sse_acc += d * d;
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
92 
// Returns only the SSE between two blocks of arbitrary (possibly odd)
// dimensions; the sum computed alongside it is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse_val;
  int sum_val;
  variance(a, a_stride, b, b_stride, w, h, &sse_val, &sum_val);
  return sse_val;
}
100 
101 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
102 // or vertical direction to produce the filtered output block. Used to implement
103 // the first-pass of 2-D separable filter.
104 //
105 // Produces int16_t output to retain precision for the next pass. Two filter
106 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
107 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
108 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)109 static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
110                                               unsigned int src_pixels_per_line,
111                                               int pixel_step,
112                                               unsigned int output_height,
113                                               unsigned int output_width,
114                                               const uint8_t *filter) {
115   unsigned int i, j;
116 
117   for (i = 0; i < output_height; ++i) {
118     for (j = 0; j < output_width; ++j) {
119       b[j] = ROUND_POWER_OF_TWO(
120           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
121 
122       ++a;
123     }
124 
125     a += src_pixels_per_line - output_width;
126     b += output_width;
127   }
128 }
129 
130 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
131 // or vertical direction to produce the filtered output block. Used to implement
132 // the second-pass of 2-D separable filter.
133 //
134 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
135 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
136 // filter is applied horizontally (pixel_step = 1) or vertically
137 // (pixel_step = stride). It defines the offset required to move from one input
138 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)139 static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
140                                                unsigned int src_pixels_per_line,
141                                                unsigned int pixel_step,
142                                                unsigned int output_height,
143                                                unsigned int output_width,
144                                                const uint8_t *filter) {
145   unsigned int i, j;
146 
147   for (i = 0; i < output_height; ++i) {
148     for (j = 0; j < output_width; ++j) {
149       b[j] = ROUND_POWER_OF_TWO(
150           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
151       ++a;
152     }
153 
154     a += src_pixels_per_line - output_width;
155     b += output_width;
156   }
157 }
158 
// Declares aom_varianceWxH_c(): computes the variance of a WxH block as
// sse - sum^2 / (W * H) and also reports the raw SSE through *sse. The
// (int64_t) cast keeps sum * sum from overflowing for large blocks.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
167 
// Declares aom_sub_pixel_varianceWxH_c(): interpolates the source block at
// (xoffset, yoffset) with the 2-tap bilinear kernels via two separable
// passes through an (H+1) x W intermediate buffer, then measures the
// variance of the filtered block against b.
#define SUBPIX_VAR(W, H)                                                \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                         \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters_2t[yoffset]);   \
                                                                        \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);       \
  }
182 
// Declares aom_sub_pixel_avg_varianceWxH_c(): like SUBPIX_VAR, but the
// bilinearly filtered block is first averaged with second_pred before the
// variance against b is computed.
#define SUBPIX_AVG_VAR(W, H)                                            \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                     \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                     \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters_2t[yoffset]);   \
                                                                        \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);              \
                                                                        \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);       \
  }
201 
/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / w*h
 */
/* The raw accumulators are returned; no variance normalization is applied. */
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
212 
/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
 * variable. The sum is computed as a side effect but discarded.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
225 
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

// Instantiate the full set of block sizes; extended-partition sizes are
// compiled in only when the corresponding experiments are enabled.
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#if CONFIG_EXT_PARTITION
VARIANCES(32, 128)
VARIANCES(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

// The sum/sse helper and plain MSE variants are only instantiated for the
// sizes below.
GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
274 
// Averages two predictors with rounding: comp_pred[i] = (pred[i] + ref[i]
// + 1) >> 1. comp_pred and pred are tightly packed at `width`; ref is
// read with ref_stride.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int r, c;

  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      comp_pred[c] = ROUND_POWER_OF_TWO(pred[c] + ref[c], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
289 
// Get pred block from up-sampled reference.
//
// Builds a width x height prediction in comp_pred (packed at `width`) from
// `ref` at the given 1/8-pel offsets (subpel_x_q3 / subpel_y_q3):
//   - both offsets zero: plain row-by-row copy;
//   - exactly one offset non-zero: a single 8-tap convolve in that direction;
//   - both non-zero: horizontal pass into a temp buffer, then vertical pass.
// NOTE(review): the q3 offsets are doubled before the kernel lookup, which
// looks like a mapping from 1/8-pel onto a 1/16-phase kernel table --
// confirm against av1_get_interp_filter_subpel_kernel.
void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride) {
  if (!subpel_x_q3 && !subpel_y_q3) {
    // Integer-pel position: straight copy.
    int i;
    for (i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else {
    InterpFilterParams filter;
    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
    if (!subpel_y_q3) {
      // Horizontal-only filtering.
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                            -1, width, height);
    } else if (!subpel_x_q3) {
      // Vertical-only filtering.
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                           16, width, height);
    } else {
      // Two-pass separable filtering. The temp buffer is tall enough for
      // the extra context rows the vertical filter taps require.
      DECLARE_ALIGNED(16, uint8_t,
                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
      const int16_t *kernel_x;
      const int16_t *kernel_y;
      int intermediate_height;
      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      intermediate_height =
          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
      // The horizontal pass starts (taps/2 - 1) rows above the block so the
      // vertical pass has the rows its upper taps need; the vertical pass
      // reads from the matching offset inside temp.
      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
                            -1, width, intermediate_height);
      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
                           16, width, height);
    }
  }
}
337 
// Builds the up-sampled prediction into comp_pred, then averages it in
// place with pred (rounded to nearest). Both buffers are packed at `width`.
void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride) {
  int r, c;

  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      comp_pred[c] = ROUND_POWER_OF_TWO(comp_pred[c] + pred[c], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
354 
355 #if CONFIG_HIGHBITDEPTH
// High bit-depth analogue of variance(): accumulates the sum and SSE over
// a w x h block of 16-bit pixels (passed through the CONVERT_TO_SHORTPTR
// convention) using 64-bit accumulators to avoid overflow.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t sum_acc = 0;
  uint64_t sse_acc = 0;
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      sum_acc += d;
      sse_acc += d * d;
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
376 
// High bit-depth SSE for blocks of arbitrary (possibly odd) dimensions;
// the sum computed alongside it is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse_val;
  int64_t sum_val;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse_val, &sum_val);
  return sse_val;
}
384 
// 8-bit-depth wrapper: narrows the 64-bit accumulators directly, with no
// rounding, since 8-bit input fits the 32-bit outputs for supported sizes.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
394 
// 10-bit-depth wrapper: rounds the accumulators back toward the 8-bit
// scale (sse by 2 bits squared = 4 bits, sum by 2 bits) before narrowing.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
404 
// 12-bit-depth wrapper: rounds the accumulators back toward the 8-bit
// scale (sse by 4 bits squared = 8 bits, sum by 4 bits) before narrowing.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
414 
// Declares aom_highbd_{8,10,12}_varianceWxH_c(): variance as
// sse - sum^2 / (W * H). The 10/12-bit variants clamp a negative result to
// 0, which can arise from the rounding highbd_10/12_variance applies to
// sse and sum independently.
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int sum;                                                                   \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
443 
// Declares aom_highbd_{8,10,12}_getSxSvar_c(): returns the (bit-depth
// normalized) raw sum and SSE for an SxS block without computing the
// variance itself.
#define HIGHBD_GET_VAR(S)                                                    \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
                                        const uint8_t *ref, int ref_stride,  \
                                        uint32_t *sse, int *sum) {           \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
  }                                                                          \
                                                                             \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }
462 
// Declares aom_highbd_{8,10,12}_mseWxH_c(): returns only the (bit-depth
// normalized) SSE; the sum is computed as a side effect but discarded.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
487 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)488 void aom_highbd_var_filter_block2d_bil_first_pass(
489     const uint8_t *src_ptr8, uint16_t *output_ptr,
490     unsigned int src_pixels_per_line, int pixel_step,
491     unsigned int output_height, unsigned int output_width,
492     const uint8_t *filter) {
493   unsigned int i, j;
494   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
495   for (i = 0; i < output_height; ++i) {
496     for (j = 0; j < output_width; ++j) {
497       output_ptr[j] = ROUND_POWER_OF_TWO(
498           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
499           FILTER_BITS);
500 
501       ++src_ptr;
502     }
503 
504     // Next row...
505     src_ptr += src_pixels_per_line - output_width;
506     output_ptr += output_width;
507   }
508 }
509 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)510 void aom_highbd_var_filter_block2d_bil_second_pass(
511     const uint16_t *src_ptr, uint16_t *output_ptr,
512     unsigned int src_pixels_per_line, unsigned int pixel_step,
513     unsigned int output_height, unsigned int output_width,
514     const uint8_t *filter) {
515   unsigned int i, j;
516 
517   for (i = 0; i < output_height; ++i) {
518     for (j = 0; j < output_width; ++j) {
519       output_ptr[j] = ROUND_POWER_OF_TWO(
520           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
521           FILTER_BITS);
522       ++src_ptr;
523     }
524 
525     src_ptr += src_pixels_per_line - output_width;
526     output_ptr += output_width;
527   }
528 }
529 
// Declares aom_highbd_{8,10,12}_sub_pixel_varianceWxH_c(): bilinearly
// interpolates the high bit-depth source at (xoffset, yoffset) via two
// separable passes through an (H+1) x W intermediate buffer, then measures
// the variance of the filtered block (re-wrapped with CONVERT_TO_BYTEPTR)
// against dst at the matching bit depth.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
575 
/* HIGHBD_SUBPIX_AVG_VAR(W, H): emits the 8-, 10- and 12-bit averaging
 * sub-pixel variance functions for a WxH block.  Each function:
 *   1. bilinearly interpolates src at (xoffset, yoffset), in 1/8-pel units,
 *      with a separable two-tap filter (fdata3 holds the horizontal pass —
 *      H + 1 rows because the vertical taps reach one row below — and
 *      temp2 the vertical pass),
 *   2. averages the interpolated block with second_pred into temp3 via
 *      aom_highbd_comp_avg_pred_c, and
 *   3. returns the variance of temp3 against dst, writing the SSE to *sse.
 * The three variants differ only in the bit-depth-specific variance
 * function they dispatch to. */
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }
636 
/* All three forms of the variance (plain, sub-pixel, and sub-pixel avg)
 * are available in the same sizes: one invocation per block size emits
 * all of the high-bitdepth variance entry points for that size. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
642 
/* Instantiate the full high-bitdepth variance set for every supported
 * block size; extended partition sizes are gated behind the matching
 * config flags. */
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_VARIANCES(32, 128)
HIGHBD_VARIANCES(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

/* get_var and MSE helpers exist only for the legacy common sizes. */
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
685 
void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  /* Rounded average of two high-bitdepth blocks:
   *   comp_pred[r][c] = (pred[r][c] + ref[r][c] + 1) >> 1
   * comp_pred and pred are tightly packed (stride == width); only the
   * reference carries an explicit stride. */
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int rows_left = height;
  while (rows_left-- > 0) {
    int c;
    for (c = 0; c < width; ++c) {
      const int pair_sum = pred[c] + ref[c];
      comp_pred[c] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
702 
/* Produce a width x height high-bitdepth prediction in comp_pred by
 * sampling ref8 at 1/8-pel offsets (subpel_x_q3, subpel_y_q3).
 * comp_pred is tightly packed (stride == width). */
void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd) {
  if (!subpel_x_q3 && !subpel_y_q3) {
    /* Integer-pel position: plain row-by-row copy, no filtering. */
    const uint16_t *ref;
    int i;
    ref = CONVERT_TO_SHORTPTR(ref8);
    for (i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else {
    InterpFilterParams filter;
    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
    if (!subpel_y_q3) {
      /* Horizontal-only filtering.  The q3 offset is doubled to index the
       * kernel table (subpel_x_q3 << 1). */
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_highbd_convolve8_horiz_c(ref8, ref_stride,
                                   CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
                                   16, NULL, -1, width, height, bd);
    } else if (!subpel_x_q3) {
      /* Vertical-only filtering. */
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_highbd_convolve8_vert_c(ref8, ref_stride,
                                  CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
                                  -1, kernel, 16, width, height, bd);
    } else {
      /* Separable 2-D filtering: horizontal pass into temp (extended by
       * enough rows for the vertical taps), then vertical pass into
       * comp_pred.
       * NOTE(review): the assert below permits intermediate_height up to
       * (MAX_SB_SIZE * 2 + 32) rows while temp only holds
       * (MAX_SB_SIZE + 32) rows of MAX_SB_SIZE — the asserted bound looks
       * looser than the buffer; confirm intermediate_height can never
       * exceed MAX_SB_SIZE + filter.taps in practice. */
      DECLARE_ALIGNED(16, uint16_t,
                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
      const int16_t *kernel_x;
      const int16_t *kernel_y;
      int intermediate_height;
      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      intermediate_height =
          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
      /* Start the horizontal pass (taps/2 - 1) rows above the block so the
       * vertical pass has the context rows it needs. */
      aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1),
                                   ref_stride, CONVERT_TO_BYTEPTR(temp),
                                   MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                   intermediate_height, bd);
      aom_highbd_convolve8_vert_c(
          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
          16, width, height, bd);
    }
  }
}
755 
void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
                                          const uint8_t *pred8, int width,
                                          int height, int subpel_x_q3,
                                          int subpel_y_q3, const uint8_t *ref8,
                                          int ref_stride, int bd) {
  /* Upsample the reference into comp_pred, then replace each sample with
   * the rounded average of itself and the corresponding pred8 sample.
   * Both comp_pred and pred8 are tightly packed (stride == width). */
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  int r, c;
  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                            ref8, ref_stride, bd);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      comp_pred[c] = ROUND_POWER_OF_TWO(comp_pred[c] + pred[c], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
774 #endif  // CONFIG_HIGHBITDEPTH
775 
776 #if CONFIG_AV1
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  /* Per-pixel 6-bit blend of ref and pred into comp_pred.  With
   * invert_mask == 0 the mask weights ref; otherwise it weights pred.
   * comp_pred and pred are packed (stride == width); ref and mask carry
   * their own strides. */
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const uint8_t m = mask[c];
      comp_pred[c] = invert_mask ? AOM_BLEND_A64(m, pred[c], ref[c])
                                 : AOM_BLEND_A64(m, ref[c], pred[c]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
796 
void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
  /* Upsample ref into comp_pred, then blend it in place with pred under
   * the 6-bit mask; invert_mask swaps which operand receives the mask
   * weight. */
  int r, c;
  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const uint8_t m = mask[c];
      comp_pred[c] = invert_mask ? AOM_BLEND_A64(m, pred[c], comp_pred[c])
                                 : AOM_BLEND_A64(m, comp_pred[c], pred[c]);
    }
    comp_pred += width;
    pred += width;
    mask += mask_stride;
  }
}
818 
/* MASK_SUBPIX_VAR(W, H): masked (compound) sub-pixel variance for a WxH
 * block: bilinearly interpolate src at (xoffset, yoffset), blend the
 * result with second_pred under msk via aom_comp_mask_pred_c, and return
 * the variance of the blend against ref (SSE written to *sse). */
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
                                      bilinear_filters_2t[xoffset]);          \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
                                       bilinear_filters_2t[yoffset]);         \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
838 
/* Instantiate the masked sub-pixel variance for every supported block
 * size; extended partition sizes are config-gated. */
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)
#endif  // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
MASK_SUBPIX_VAR(32, 128)
MASK_SUBPIX_VAR(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_EXT_PARTITION_TYPES
870 
871 #if CONFIG_HIGHBITDEPTH
void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  /* High-bitdepth per-pixel 6-bit blend of ref and pred into comp_pred.
   * With invert_mask == 0 the mask weights ref; otherwise it weights
   * pred.  comp_pred and pred are packed (stride == width). */
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const uint8_t m = mask[c];
      comp_pred[c] = invert_mask ? AOM_BLEND_A64(m, pred[c], ref[c])
                                 : AOM_BLEND_A64(m, ref[c], pred[c]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
892 
void aom_highbd_comp_mask_upsampled_pred_c(
    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
  /* Upsample the high-bitdepth reference into comp_pred, then blend it in
   * place with pred8 under the 6-bit mask; invert_mask swaps which operand
   * receives the mask weight. */
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  int r, c;
  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                            ref8, ref_stride, bd);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const uint8_t m = mask[c];
      comp_pred[c] = invert_mask ? AOM_BLEND_A64(m, pred[c], comp_pred[c])
                                 : AOM_BLEND_A64(m, comp_pred[c], pred[c]);
    }
    comp_pred += width;
    pred += width;
    mask += mask_stride;
  }
}
914 
/* HIGHBD_MASK_SUBPIX_VAR(W, H): emits the 8-, 10- and 12-bit masked
 * sub-pixel variance functions for a WxH block.  Each one bilinearly
 * interpolates src at (xoffset, yoffset), blends the result with
 * second_pred under msk via aom_highbd_comp_mask_pred_c, and returns the
 * bit-depth-specific variance of the blend against ref. */
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                      \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                  \
                                                                               \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
                                              ref, ref_stride, sse);           \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                      \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                  \
                                                                               \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                               ref, ref_stride, sse);          \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                      \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                  \
                                                                               \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                               ref, ref_stride, sse);          \
  }
981 
/* Instantiate the high-bitdepth masked sub-pixel variance for every
 * supported block size; extended partition sizes are config-gated. */
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#endif  // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_MASK_SUBPIX_VAR(32, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_EXT_PARTITION_TYPES
1013 #endif  // CONFIG_HIGHBITDEPTH
1014 #endif  // CONFIG_AV1
1015 
1016 #if CONFIG_AV1 && CONFIG_MOTION_VAR
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1017 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
1018                                  const int32_t *wsrc, const int32_t *mask,
1019                                  int w, int h, unsigned int *sse, int *sum) {
1020   int i, j;
1021 
1022   *sse = 0;
1023   *sum = 0;
1024 
1025   for (i = 0; i < h; i++) {
1026     for (j = 0; j < w; j++) {
1027       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1028       *sum += diff;
1029       *sse += diff * diff;
1030     }
1031 
1032     pre += pre_stride;
1033     wsrc += w;
1034     mask += w;
1035   }
1036 }
1037 
/* OBMC_VAR(W, H): overlapped-block variance for a WxH block:
 * variance = SSE - (sum^2 / N).  sum is widened to int64_t before the
 * multiply so the product cannot overflow for large blocks. */
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }
1046 
/* OBMC_SUBPIX_VAR(W, H): OBMC variance of a WxH block after bilinear
 * sub-pixel interpolation of pre at (xoffset, yoffset); fdata3 holds the
 * horizontal pass (H + 1 rows for the vertical taps), temp2 the final
 * interpolated block. */
#define OBMC_SUBPIX_VAR(W, H)                                               \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    uint16_t fdata3[(H + 1) * W];                                           \
    uint8_t temp2[H * W];                                                   \
                                                                            \
    var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,           \
                                       bilinear_filters_2t[yoffset]);       \
                                                                            \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
  }
1061 
/* Instantiate the OBMC variance pair (plain + sub-pixel) for every
 * supported block size; extended partition sizes are config-gated. */
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

#if CONFIG_EXT_PARTITION
OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)
#endif  // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
OBMC_VAR(32, 128)
OBMC_SUBPIX_VAR(32, 128)
OBMC_VAR(128, 32)
OBMC_SUBPIX_VAR(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_EXT_PARTITION_TYPES
1132 
1133 #if CONFIG_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1134 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1135                                           const int32_t *wsrc,
1136                                           const int32_t *mask, int w, int h,
1137                                           uint64_t *sse, int64_t *sum) {
1138   int i, j;
1139   uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1140 
1141   *sse = 0;
1142   *sum = 0;
1143 
1144   for (i = 0; i < h; i++) {
1145     for (j = 0; j < w; j++) {
1146       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1147       *sum += diff;
1148       *sse += diff * diff;
1149     }
1150 
1151     pre += pre_stride;
1152     wsrc += w;
1153     mask += w;
1154   }
1155 }
1156 
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1157 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1158                                         const int32_t *wsrc,
1159                                         const int32_t *mask, int w, int h,
1160                                         unsigned int *sse, int *sum) {
1161   int64_t sum64;
1162   uint64_t sse64;
1163   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1164   *sum = (int)sum64;
1165   *sse = (unsigned int)sse64;
1166 }
1167 
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1168 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1169                                            const int32_t *wsrc,
1170                                            const int32_t *mask, int w, int h,
1171                                            unsigned int *sse, int *sum) {
1172   int64_t sum64;
1173   uint64_t sse64;
1174   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1175   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1176   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1177 }
1178 
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1179 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1180                                            const int32_t *wsrc,
1181                                            const int32_t *mask, int w, int h,
1182                                            unsigned int *sse, int *sum) {
1183   int64_t sum64;
1184   uint64_t sse64;
1185   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1186   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1187   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1188 }
1189 
/* HIGHBD_OBMC_VAR(W, H): emits the 8-, 10- and 12-bit OBMC variance
 * functions for a WxH block (variance = SSE - sum^2 / N).  The 10/12-bit
 * variants clamp a negative intermediate variance to 0, which can arise
 * from the rounding applied when narrowing the 64-bit accumulators.
 * NOTE(review): the 8-bit variant performs the subtraction in unsigned
 * arithmetic with no such clamp — confirm it cannot go negative there. */
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
1218 
/*
 * HIGHBD_OBMC_SUBPIX_VAR(W, H)
 *
 * Expands to three C-reference functions — one per supported bit depth
 * (8, 10, 12) — that compute the OBMC (overlapped block motion
 * compensation) variance of a WxH block at a sub-pel position:
 *
 *   1. First pass: bilinear horizontal filter of the (H+1)-row-by-W-col
 *      prediction region, selected by xoffset from bilinear_filters_2t,
 *      into fdata3. One extra row is filtered so the vertical pass has a
 *      bottom neighbor for every output row.
 *   2. Second pass: bilinear vertical filter (yoffset tap set) of fdata3
 *      into temp2, the final WxH filtered prediction.
 *   3. Delegate to the matching full-pel
 *      aom_highbd[_10_|_12_]obmc_variance##W##x##H##_c() routine, which
 *      computes and returns the variance and writes *sse.
 *
 * Generated-function parameters:
 *   pre, pre_stride  - prediction buffer; a uint8_t* alias of 16-bit
 *                      samples (CONVERT_TO_BYTEPTR restores the uint16_t
 *                      view for the full-pel call — libaom's high-bitdepth
 *                      pointer convention from aom_ports/mem.h)
 *   xoffset, yoffset - sub-pel phases indexing bilinear_filters_2t
 *   wsrc, mask       - 32-bit weighted source and OBMC blending mask,
 *                      consumed by the full-pel variance routine
 *   sse              - out: sum of squared errors of the filtered block
 *
 * NOTE: fdata3/temp2 look like VLAs but W and H are macro constants at
 * every instantiation site, so they are fixed-size stack arrays.
 */
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(                \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W]; /* horizontal-pass output, H+1 rows */       \
    uint16_t temp2[H * W];        /* final filtered WxH prediction */          \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                                 wsrc, mask, sse);             \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);       \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);       \
  }
1264 
/* Instantiate the full-pel (HIGHBD_OBMC_VAR) and sub-pel
 * (HIGHBD_OBMC_SUBPIX_VAR) high-bitdepth OBMC variance functions for every
 * square and 2:1 block size in the base AV1 partition set (4x4 .. 64x64). */
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

/* 128-pixel superblock sizes, only when extended partitions are built in. */
#if CONFIG_EXT_PARTITION
HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)
#endif  // CONFIG_EXT_PARTITION

/* 4:1 / 1:4 aspect-ratio block sizes for the extended partition types. */
#if CONFIG_EXT_PARTITION_TYPES
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_OBMC_VAR(32, 128)
HIGHBD_OBMC_SUBPIX_VAR(32, 128)
HIGHBD_OBMC_VAR(128, 32)
HIGHBD_OBMC_SUBPIX_VAR(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_EXT_PARTITION_TYPES
/* The two #endif's below close #if blocks opened earlier in the file. */
#endif  // CONFIG_HIGHBITDEPTH
#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
1337