1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <stdlib.h>
12 #include <string.h>
13 #include <assert.h>
14
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17
18 #include "aom_ports/mem.h"
19 #include "aom/aom_integer.h"
20
21 #include "aom_dsp/variance.h"
22 #include "aom_dsp/aom_filter.h"
23 #include "aom_dsp/blend.h"
24
25 #include "./av1_rtcd.h"
26 #include "av1/common/filter.h"
27
// Sum of squared differences between two 4x4 blocks of 8-bit pixels.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  uint32_t sse = 0;
  int row;

  for (row = 0; row < 4; ++row) {
    int col;
    for (col = 0; col < 4; ++col) {
      const int d = a[col] - b[col];
      sse += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }

  return sse;
}
45
// Sum of squares of a 16x16 macroblock of 16-bit residuals (256 values).
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  const int16_t *const end = a + 256;
  unsigned int sum = 0;

  while (a < end) {
    const int v = *a++;
    sum += (unsigned int)(v * v);
  }

  return sum;
}
55
// 16x16 variance at a horizontal half-pel offset (subpel offset 4 of 8).
uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  const int half_pel = 4;  // half-pel in 1/8-pel units
  return aom_sub_pixel_variance16x16_c(a, a_stride, half_pel, 0, b, b_stride,
                                       sse);
}
61
// 16x16 variance at a vertical half-pel offset (subpel offset 4 of 8).
uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  const int half_pel = 4;  // half-pel in 1/8-pel units
  return aom_sub_pixel_variance16x16_c(a, a_stride, 0, half_pel, b, b_stride,
                                       sse);
}
67
// 16x16 variance at a diagonal half-pel offset (4/8 in both directions).
uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
                                           const uint8_t *b, int b_stride,
                                           uint32_t *sse) {
  const int half_pel = 4;  // half-pel in 1/8-pel units
  return aom_sub_pixel_variance16x16_c(a, a_stride, half_pel, half_pel, b,
                                       b_stride, sse);
}
73
// Accumulates the pixel-difference sum and sum of squared differences
// between a w x h block of `a` and the corresponding block of `b`.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int row, col;
  int s = 0;
  uint32_t ss = 0;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      s += d;
      ss += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = s;
  *sse = ss;
}
92
// SSE for blocks of arbitrary (non-standard) dimensions; the block sum is
// computed by the shared helper but discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse = 0;
  int unused_sum = 0;
  variance(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  (void)unused_sum;
  return sse;
}
100
101 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
102 // or vertical direction to produce the filtered output block. Used to implement
103 // the first-pass of 2-D separable filter.
104 //
105 // Produces int16_t output to retain precision for the next pass. Two filter
106 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
107 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
108 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)109 static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
110 unsigned int src_pixels_per_line,
111 int pixel_step,
112 unsigned int output_height,
113 unsigned int output_width,
114 const uint8_t *filter) {
115 unsigned int i, j;
116
117 for (i = 0; i < output_height; ++i) {
118 for (j = 0; j < output_width; ++j) {
119 b[j] = ROUND_POWER_OF_TWO(
120 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
121
122 ++a;
123 }
124
125 a += src_pixels_per_line - output_width;
126 b += output_width;
127 }
128 }
129
130 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
131 // or vertical direction to produce the filtered output block. Used to implement
132 // the second-pass of 2-D separable filter.
133 //
134 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
135 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
136 // filter is applied horizontally (pixel_step = 1) or vertically
137 // (pixel_step = stride). It defines the offset required to move from one input
138 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)139 static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
140 unsigned int src_pixels_per_line,
141 unsigned int pixel_step,
142 unsigned int output_height,
143 unsigned int output_width,
144 const uint8_t *filter) {
145 unsigned int i, j;
146
147 for (i = 0; i < output_height; ++i) {
148 for (j = 0; j < output_width; ++j) {
149 b[j] = ROUND_POWER_OF_TWO(
150 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
151 ++a;
152 }
153
154 a += src_pixels_per_line - output_width;
155 b += output_width;
156 }
157 }
158
// Emits aom_variance{W}x{H}_c: returns SSE - sum^2 / (W * H), truncating.
// sum * sum is widened to int64_t so the product cannot overflow for the
// largest block sizes (e.g. 128x128).
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
167
// Emits aom_sub_pixel_variance{W}x{H}_c: bilinearly interpolates the source
// at (xoffset, yoffset) (in 1/8-pel units) into temp2, then measures the
// variance against b.  fdata3 holds 16-bit first-pass output and carries one
// extra row for the vertical tap.
#define SUBPIX_VAR(W, H)                                                \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                         \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters_2t[yoffset]);   \
                                                                        \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);       \
  }
182
// Emits aom_sub_pixel_avg_variance{W}x{H}_c: same as SUBPIX_VAR, but the
// interpolated block is first averaged with second_pred (into temp3) before
// the variance is computed — used for compound prediction.
#define SUBPIX_AVG_VAR(W, H)                                            \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                     \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
      const uint8_t *b, int b_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                     \
    uint16_t fdata3[(H + 1) * W];                                       \
    uint8_t temp2[H * W];                                               \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
                                                                        \
    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
                                       bilinear_filters_2t[yoffset]);   \
                                                                        \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);              \
                                                                        \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);       \
  }
201
202 /* Identical to the variance call except it takes an additional parameter, sum,
203 * and returns that value using pass-by-reference instead of returning
204 * sse - sum^2 / w*h
205 */
// Emits aom_get{W}x{H}var_c: returns SSE and sum via out-parameters without
// forming the final variance value.
#define GET_VAR(W, H)                                                         \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) {                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
  }
212
213 /* Identical to the variance call except it does not calculate the
214 * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
215 * variable.
216 */
// Emits aom_mse{W}x{H}_c: returns only the SSE (mean-squared-error
// numerator); the block sum is computed by the helper but unused here.
#define MSE(W, H)                                                  \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride,    \
                                const uint8_t *b, int b_stride,    \
                                uint32_t *sse) {                   \
    int sum;                                                       \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);           \
    return *sse;                                                   \
  }
225
/* Instantiates the plain, sub-pixel, and sub-pixel-average variance
 * functions for one block size. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
231
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128-wide superblock sizes only exist with extended partitions.
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
// Standard square and 2:1 rectangular block sizes, down to 2x2.
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
// 4:1 rectangular sizes used by extended partition types.
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#if CONFIG_EXT_PARTITION
VARIANCES(32, 128)
VARIANCES(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
274
// Averages `pred` with `ref`, rounding to nearest, into comp_pred.
// comp_pred and pred use a stride of `width`; ref uses ref_stride.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
289
// Get pred block from up-sampled reference.
//
// Writes a width x height prediction (stride == width) into comp_pred at
// subpel offset (subpel_x_q3, subpel_y_q3), given in 1/8-pel units.
void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride) {
  if (!subpel_x_q3 && !subpel_y_q3) {
    // Integer-pel position: no filtering, just copy the rows.
    int i;
    for (i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else {
    InterpFilterParams filter;
    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
    if (!subpel_y_q3) {
      // Horizontal-only subpel motion: single horizontal convolve pass.
      const int16_t *kernel;
      // The kernel table is indexed in finer (doubled) subpel units,
      // hence the << 1.
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                            -1, width, height);
    } else if (!subpel_x_q3) {
      // Vertical-only subpel motion: single vertical convolve pass.
      const int16_t *kernel;
      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      /*Directly call C version to allow this to work for small (2x2) sizes.*/
      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                           16, width, height);
    } else {
      // 2-D subpel motion: separable filtering via an intermediate buffer.
      // temp must hold intermediate_height rows of MAX_SB_SIZE-stride data;
      // its size matches the assert bound below.
      DECLARE_ALIGNED(16, uint8_t,
                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
      const int16_t *kernel_x;
      const int16_t *kernel_y;
      int intermediate_height;
      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
      // Extra rows are needed above/below for the vertical filter taps.
      intermediate_height =
          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
                            -1, width, intermediate_height);
      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
                           16, width, height);
    }
  }
}
337
// Builds the subpel prediction into comp_pred, then averages it with `pred`
// (rounding to nearest).  Both comp_pred and pred use a stride of `width`.
void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride) {
  int row, col;

  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(comp_pred[col] + pred[col], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
354
355 #if CONFIG_HIGHBITDEPTH
// Accumulates sum and SSE for a w x h high-bitdepth block in 64-bit
// precision.  a8/b8 are CONVERT_TO_BYTEPTR-style handles to 16-bit samples.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t s = 0;
  uint64_t ss = 0;
  int row, col;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      s += d;
      ss += (uint64_t)((int64_t)d * d);
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = s;
  *sse = ss;
}
376
// High-bitdepth SSE for blocks of arbitrary dimensions; the sum is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int64_t unused_sum = 0;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  (void)unused_sum;
  return sse;
}
384
// 8-bit-depth statistics: the 64-bit accumulators are narrowed directly,
// with no scaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
394
// 10-bit-depth statistics: scaled down to an 8-bit-equivalent range with
// rounding (sse by 2^4, sum by 2^2).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
404
// 12-bit-depth statistics: scaled down to an 8-bit-equivalent range with
// rounding (sse by 2^8, sum by 2^4).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
414
// Emits aom_highbd_{8,10,12}_variance{W}x{H}_c.  The 10/12-bit variants
// clamp a negative result (possible because the scaled sse/sum are rounded
// independently) to 0; the 8-bit variant subtracts directly, mirroring the
// low-bitdepth VAR macro where no such rounding occurs.
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int sum;                                                                   \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
443
// Emits aom_highbd_{8,10,12}_get{S}x{S}var_c: SSE and sum via
// out-parameters, scaled per bit depth, without forming the variance.
#define HIGHBD_GET_VAR(S)                                                   \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) {          \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                         \
                                                                            \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }                                                                          \
                                                                             \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {          \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
  }
462
// Emits aom_highbd_{8,10,12}_mse{W}x{H}_c: returns only the (bit-depth
// scaled) SSE; the block sum is computed by the helper but unused.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
487
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)488 void aom_highbd_var_filter_block2d_bil_first_pass(
489 const uint8_t *src_ptr8, uint16_t *output_ptr,
490 unsigned int src_pixels_per_line, int pixel_step,
491 unsigned int output_height, unsigned int output_width,
492 const uint8_t *filter) {
493 unsigned int i, j;
494 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
495 for (i = 0; i < output_height; ++i) {
496 for (j = 0; j < output_width; ++j) {
497 output_ptr[j] = ROUND_POWER_OF_TWO(
498 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
499 FILTER_BITS);
500
501 ++src_ptr;
502 }
503
504 // Next row...
505 src_ptr += src_pixels_per_line - output_width;
506 output_ptr += output_width;
507 }
508 }
509
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)510 void aom_highbd_var_filter_block2d_bil_second_pass(
511 const uint16_t *src_ptr, uint16_t *output_ptr,
512 unsigned int src_pixels_per_line, unsigned int pixel_step,
513 unsigned int output_height, unsigned int output_width,
514 const uint8_t *filter) {
515 unsigned int i, j;
516
517 for (i = 0; i < output_height; ++i) {
518 for (j = 0; j < output_width; ++j) {
519 output_ptr[j] = ROUND_POWER_OF_TWO(
520 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
521 FILTER_BITS);
522 ++src_ptr;
523 }
524
525 src_ptr += src_pixels_per_line - output_width;
526 output_ptr += output_width;
527 }
528 }
529
// Emits aom_highbd_{8,10,12}_sub_pixel_variance{W}x{H}_c: bilinear
// interpolation at (xoffset, yoffset) into temp2, then bit-depth-specific
// variance against dst.  temp2 is re-wrapped with CONVERT_TO_BYTEPTR because
// the variance entry points take byte-pointer handles to 16-bit data.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
575
// Emits aom_highbd_{8,10,12}_sub_pixel_avg_variance{W}x{H}_c: same as
// HIGHBD_SUBPIX_VAR, but the interpolated block is first averaged with
// second_pred (into temp3) before the variance is computed.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }
636
/* Instantiates the plain, sub-pixel, and sub-pixel-average high-bitdepth
 * variance functions for one block size. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
642
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128-wide superblock sizes only exist with extended partitions.
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
// Standard square and 2:1 rectangular block sizes, down to 2x2.
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
// 4:1 rectangular sizes used by extended partition types.
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_VARIANCES(32, 128)
HIGHBD_VARIANCES(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
685
// Averages the high-bitdepth prediction `pred8` with `ref8`, rounding to
// nearest, into comp_pred.  comp_pred and pred use a stride of `width`.
void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
702
aom_highbd_upsampled_pred_c(uint16_t * comp_pred,int width,int height,int subpel_x_q3,int subpel_y_q3,const uint8_t * ref8,int ref_stride,int bd)703 void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
704 int subpel_x_q3, int subpel_y_q3,
705 const uint8_t *ref8, int ref_stride, int bd) {
706 if (!subpel_x_q3 && !subpel_y_q3) {
707 const uint16_t *ref;
708 int i;
709 ref = CONVERT_TO_SHORTPTR(ref8);
710 for (i = 0; i < height; i++) {
711 memcpy(comp_pred, ref, width * sizeof(*comp_pred));
712 comp_pred += width;
713 ref += ref_stride;
714 }
715 } else {
716 InterpFilterParams filter;
717 filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
718 if (!subpel_y_q3) {
719 const int16_t *kernel;
720 kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
721 /*Directly call C version to allow this to work for small (2x2) sizes.*/
722 aom_highbd_convolve8_horiz_c(ref8, ref_stride,
723 CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
724 16, NULL, -1, width, height, bd);
725 } else if (!subpel_x_q3) {
726 const int16_t *kernel;
727 kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
728 /*Directly call C version to allow this to work for small (2x2) sizes.*/
729 aom_highbd_convolve8_vert_c(ref8, ref_stride,
730 CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
731 -1, kernel, 16, width, height, bd);
732 } else {
733 DECLARE_ALIGNED(16, uint16_t,
734 temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
735 const int16_t *kernel_x;
736 const int16_t *kernel_y;
737 int intermediate_height;
738 kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
739 kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
740 intermediate_height =
741 (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
742 assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
743 /*Directly call C versions to allow this to work for small (2x2) sizes.*/
744 aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1),
745 ref_stride, CONVERT_TO_BYTEPTR(temp),
746 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
747 intermediate_height, bd);
748 aom_highbd_convolve8_vert_c(
749 CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
750 MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
751 16, width, height, bd);
752 }
753 }
754 }
755
// Builds the high-bitdepth subpel prediction into comp_pred, then averages
// it with `pred8` (rounding to nearest).  Both buffers use a stride of
// `width`.
void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
                                          const uint8_t *pred8, int width,
                                          int height, int subpel_x_q3,
                                          int subpel_y_q3, const uint8_t *ref8,
                                          int ref_stride, int bd) {
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  int row, col;

  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                            ref8, ref_stride, bd);
  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + comp_pred[col], 1);
    }
    comp_pred += width;
    pred += width;
  }
}
774 #endif // CONFIG_HIGHBITDEPTH
775
776 #if CONFIG_AV1
// Blends `ref` and `pred` per pixel with a 64-weight mask into comp_pred.
// With invert_mask == 0 the mask weights `ref`; otherwise it weights `pred`.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int row, col;

  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      const uint8_t m = mask[col];
      comp_pred[col] = invert_mask ? AOM_BLEND_A64(m, pred[col], ref[col])
                                   : AOM_BLEND_A64(m, ref[col], pred[col]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
796
// Builds the subpel prediction into comp_pred, then mask-blends it with
// `pred` in place.  With invert_mask == 0 the mask weights the upsampled
// prediction; otherwise it weights `pred`.
void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
  int row, col;

  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  for (row = 0; row < height; ++row) {
    for (col = 0; col < width; ++col) {
      const uint8_t m = mask[col];
      comp_pred[col] = invert_mask
                           ? AOM_BLEND_A64(m, pred[col], comp_pred[col])
                           : AOM_BLEND_A64(m, comp_pred[col], pred[col]);
    }
    comp_pred += width;
    pred += width;
    mask += mask_stride;
  }
}
818
// Emits aom_masked_sub_pixel_variance{W}x{H}_c: bilinearly interpolates the
// source at (xoffset, yoffset) into temp2, mask-blends it with second_pred
// into temp3, then measures the variance of temp3 against ref.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
                                      bilinear_filters_2t[xoffset]);          \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
                                       bilinear_filters_2t[yoffset]);         \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
838
// Masked sub-pixel variance for the standard square and 2:1 block sizes.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)
#endif  // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
// 4:1 rectangular sizes used by extended partition types.
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
MASK_SUBPIX_VAR(32, 128)
MASK_SUBPIX_VAR(128, 32)
#endif  // CONFIG_EXT_PARTITION
#endif  // CONFIG_EXT_PARTITION_TYPES
870
871 #if CONFIG_HIGHBITDEPTH
// High-bitdepth compound masked prediction: blends the 16-bit reference
// and prediction blocks pixel-by-pixel with a 0..64 mask into comp_pred.
// invert_mask swaps which of the two inputs the mask weights.
void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int r, c;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      // Select blend operands up front so a single blend expression remains.
      const uint16_t p0 = invert_mask ? pred[c] : ref[c];
      const uint16_t p1 = invert_mask ? ref[c] : pred[c];
      comp_pred[c] = AOM_BLEND_A64(mask[c], p0, p1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
892
// High-bitdepth counterpart of aom_comp_mask_upsampled_pred_c: upsample the
// reference at the given 1/8-pel offset into comp_pred, then blend in place
// with pred8 under the 0..64 mask (invert_mask swaps the blend inputs).
void aom_highbd_comp_mask_upsampled_pred_c(
    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
  int r, c;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);

  // Interpolated reference lands in comp_pred; it then serves as a blend
  // input and the destination simultaneously.
  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                            ref8, ref_stride, bd);
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const uint16_t p0 = invert_mask ? pred[c] : comp_pred[c];
      const uint16_t p1 = invert_mask ? comp_pred[c] : pred[c];
      comp_pred[c] = AOM_BLEND_A64(mask[c], p0, p1);
    }
    comp_pred += width;
    pred += width;
    mask += mask_stride;
  }
}
914
// Generates the 8-, 10- and 12-bit high-bitdepth masked sub-pixel variance
// functions for a WxH block: bilinear-filter src at (xoffset, yoffset),
// blend the filtered block with second_pred under msk (optionally
// inverted), and return the variance of the blend against ref via the
// bit-depth-specific variance helper.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                     \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                     \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H,                     \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }
981
// Instantiate the high-bitdepth masked sub-pixel variance kernels for every
// supported block size.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_MASK_SUBPIX_VAR(32, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 32)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_EXT_PARTITION_TYPES
#endif // CONFIG_HIGHBITDEPTH
#endif // CONFIG_AV1
1015
1016 #if CONFIG_AV1 && CONFIG_MOTION_VAR
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1017 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
1018 const int32_t *wsrc, const int32_t *mask,
1019 int w, int h, unsigned int *sse, int *sum) {
1020 int i, j;
1021
1022 *sse = 0;
1023 *sum = 0;
1024
1025 for (i = 0; i < h; i++) {
1026 for (j = 0; j < w; j++) {
1027 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1028 *sum += diff;
1029 *sse += diff * diff;
1030 }
1031
1032 pre += pre_stride;
1033 wsrc += w;
1034 mask += w;
1035 }
1036 }
1037
// Generates aom_obmc_variance<W>x<H>_c(): OBMC variance of pre against the
// weighted source, computed as sse - sum^2 / (W * H); *sse is also
// returned to the caller as a side output.
#define OBMC_VAR(W, H)                                                \
  unsigned int aom_obmc_variance##W##x##H##_c(                        \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,        \
      const int32_t *mask, unsigned int *sse) {                       \
    int sum;                                                          \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));     \
  }
1046
// Generates aom_obmc_sub_pixel_variance<W>x<H>_c(): applies the 2-tap
// bilinear filters at (xoffset, yoffset) to pre, then evaluates the OBMC
// variance of the filtered block.
#define OBMC_SUBPIX_VAR(W, H)                                               \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    uint16_t fdata3[(H + 1) * W];                                           \
    uint8_t temp2[H * W];                                                   \
                                                                            \
    var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
                                      bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,           \
                                       bilinear_filters_2t[yoffset]);       \
                                                                            \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
  }
1061
// Instantiate OBMC variance and sub-pixel variance kernels for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

#if CONFIG_EXT_PARTITION
OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
OBMC_VAR(32, 128)
OBMC_SUBPIX_VAR(32, 128)
OBMC_VAR(128, 32)
OBMC_SUBPIX_VAR(128, 32)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_EXT_PARTITION_TYPES
1132
1133 #if CONFIG_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1134 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1135 const int32_t *wsrc,
1136 const int32_t *mask, int w, int h,
1137 uint64_t *sse, int64_t *sum) {
1138 int i, j;
1139 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1140
1141 *sse = 0;
1142 *sum = 0;
1143
1144 for (i = 0; i < h; i++) {
1145 for (j = 0; j < w; j++) {
1146 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1147 *sum += diff;
1148 *sse += diff * diff;
1149 }
1150
1151 pre += pre_stride;
1152 wsrc += w;
1153 mask += w;
1154 }
1155 }
1156
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1157 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1158 const int32_t *wsrc,
1159 const int32_t *mask, int w, int h,
1160 unsigned int *sse, int *sum) {
1161 int64_t sum64;
1162 uint64_t sse64;
1163 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1164 *sum = (int)sum64;
1165 *sse = (unsigned int)sse64;
1166 }
1167
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1168 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1169 const int32_t *wsrc,
1170 const int32_t *mask, int w, int h,
1171 unsigned int *sse, int *sum) {
1172 int64_t sum64;
1173 uint64_t sse64;
1174 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1175 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1176 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1177 }
1178
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1179 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1180 const int32_t *wsrc,
1181 const int32_t *mask, int w, int h,
1182 unsigned int *sse, int *sum) {
1183 int64_t sum64;
1184 uint64_t sse64;
1185 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1186 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1187 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1188 }
1189
// Generates the 8-, 10- and 12-bit OBMC variance functions for a WxH
// block. The 10/12-bit variants clamp a negative variance -- possible
// because sse and sum are rounded independently -- to zero.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
1218
// Generates the 8-, 10- and 12-bit OBMC sub-pixel variance functions:
// bilinear-filter pre at (xoffset, yoffset), then evaluate the matching
// bit-depth OBMC variance on the filtered block.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(               \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),   \
                                                 W, wsrc, mask, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),\
                                                    W, wsrc, mask, sse);      \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),\
                                                    W, wsrc, mask, sse);      \
  }
1264
// Instantiate the high-bitdepth OBMC variance and sub-pixel variance
// kernels for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

#if CONFIG_EXT_PARTITION
HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION

#if CONFIG_EXT_PARTITION_TYPES
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#if CONFIG_EXT_PARTITION
HIGHBD_OBMC_VAR(32, 128)
HIGHBD_OBMC_SUBPIX_VAR(32, 128)
HIGHBD_OBMC_VAR(128, 32)
HIGHBD_OBMC_SUBPIX_VAR(128, 32)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_EXT_PARTITION_TYPES
#endif // CONFIG_HIGHBITDEPTH
#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
1337