1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_ports/mem.h"
14 
// Common signature of the block kernels used in this file: a source and a
// reference block of 16-bit samples with their strides go in, and the kernel
// reports its results through *sse and *sum. The uint32_t return value is not
// used by any caller visible in this file.
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

// 8x8 kernel. Only declared here; its definition is not part of this file.
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

// 16x16 kernel. Only declared here; its definition is not part of this file.
uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
26 
// Walks a w x h region in block_size x block_size tiles, runs var_fn on every
// tile and adds up the per-tile outputs.
//
//   src, ref          top-left sample of the two regions being compared
//   src_stride,
//   ref_stride        distance between consecutive rows, in samples
//   sse, sum          receive the totals of the per-tile *sse / *sum outputs;
//                     both are 0 when the region is empty
//   var_fn            tile kernel
//   block_size        tile edge length; also the step of both loops
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride, int w,
                                   int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  uint32_t sse_total = 0;
  int sum_total = 0;
  int row;

  for (row = 0; row < h; row += block_size) {
    int col;

    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src + src_stride * row + col, src_stride,
             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
      sse_total += tile_sse;
      sum_total += tile_sum;
    }
  }

  *sse = sse_total;
  *sum = sum_total;
}
47 
// 10-bit flavour of the tiled accumulation: runs var_fn on every
// block_size x block_size tile of the w x h region, keeps the running sse in
// 64 bits, and finally stores the totals scaled down with rounding:
// sum by 2 bits and sse by 4 bits.
// NOTE(review): the 2/4-bit shifts presumably bring 10-bit results back to
// the 8-bit scale (sum / 4, sse / 16) -- confirm against the C reference.
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  uint64_t sse_acc = 0;
  int32_t sum_acc = 0;
  int row;

  for (row = 0; row < h; row += block_size) {
    int col;

    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src + src_stride * row + col, src_stride,
             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
      sse_acc += tile_sse;
      sum_acc += tile_sum;
    }
  }

  *sum = ROUND_POWER_OF_TWO(sum_acc, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_acc, 4);
}
69 
// 12-bit flavour of the tiled accumulation: runs var_fn on every
// block_size x block_size tile of the w x h region, keeps the running sse in
// 64 bits, and finally stores the totals scaled down with rounding:
// sum by 4 bits and sse by 8 bits.
// NOTE(review): the 4/8-bit shifts presumably bring 12-bit results back to
// the 8-bit scale (sum / 16, sse / 256) -- confirm against the C reference.
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  uint64_t sse_acc = 0;
  int32_t sum_acc = 0;
  int row;

  for (row = 0; row < h; row += block_size) {
    int col;

    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src + src_stride * row + col, src_stride,
             ref + ref_stride * row + col, ref_stride, &tile_sse, &tile_sum);
      sse_acc += tile_sse;
      sum_acc += tile_sum;
    }
  }

  *sum = ROUND_POWER_OF_TWO(sum_acc, 4);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_acc, 8);
}
91 
// HIGH_GET_VAR(S) defines vpx_highbd_{8,10,12}_get<S>x<S>var_sse2().
// Each generated function converts the two uint8_t-typed pointers with
// CONVERT_TO_SHORTPTR and forwards to vpx_highbd_calc<S>x<S>var_sse2(), which
// fills *sse and *sum.
//   - 8-bit:  the kernel outputs are returned unchanged.
//   - 10-bit: *sum is rounded down by 2 bits and *sse by 4 bits.
//   - 12-bit: *sum is rounded down by 4 bits and *sse by 8 bits.
// The macro expands to complete function definitions, so it is invoked
// without a trailing ';' (a lone ';' at file scope is not valid ISO C).
#define HIGH_GET_VAR(S)                                                       \
  void vpx_highbd_8_get##S##x##S##var_sse2(                                   \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
  }                                                                           \
                                                                              \
  void vpx_highbd_10_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
  }                                                                           \
                                                                              \
  void vpx_highbd_12_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
  }

HIGH_GET_VAR(16)
HIGH_GET_VAR(8)

#undef HIGH_GET_VAR
128 
129 #define VAR_FN(w, h, block_size, shift)                                    \
130   uint32_t vpx_highbd_8_variance##w##x##h##_sse2(                          \
131       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
132       int ref_stride, uint32_t *sse) {                                     \
133     int sum;                                                               \
134     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
135     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
136     highbd_8_variance_sse2(                                                \
137         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
138         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
139     return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift));             \
140   }                                                                        \
141                                                                            \
142   uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
143       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
144       int ref_stride, uint32_t *sse) {                                     \
145     int sum;                                                               \
146     int64_t var;                                                           \
147     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
148     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
149     highbd_10_variance_sse2(                                               \
150         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
151         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
152     var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
153     return (var >= 0) ? (uint32_t)var : 0;                                 \
154   }                                                                        \
155                                                                            \
156   uint32_t vpx_highbd_12_variance##w##x##h##_sse2(                         \
157       const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
158       int ref_stride, uint32_t *sse) {                                     \
159     int sum;                                                               \
160     int64_t var;                                                           \
161     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
162     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
163     highbd_12_variance_sse2(                                               \
164         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
165         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
166     var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
167     return (var >= 0) ? (uint32_t)var : 0;                                 \
168   }
169 
170 VAR_FN(64, 64, 16, 12);
171 VAR_FN(64, 32, 16, 11);
172 VAR_FN(32, 64, 16, 11);
173 VAR_FN(32, 32, 16, 10);
174 VAR_FN(32, 16, 16, 9);
175 VAR_FN(16, 32, 16, 9);
176 VAR_FN(16, 16, 16, 8);
177 VAR_FN(16, 8, 8, 7);
178 VAR_FN(8, 16, 8, 7);
179 VAR_FN(8, 8, 8, 6);
180 
181 #undef VAR_FN
182 
vpx_highbd_8_mse16x16_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)183 unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
184                                         const uint8_t *ref8, int ref_stride,
185                                         unsigned int *sse) {
186   int sum;
187   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
188   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
189   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
190                          vpx_highbd_calc16x16var_sse2, 16);
191   return *sse;
192 }
193 
vpx_highbd_10_mse16x16_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)194 unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
195                                          const uint8_t *ref8, int ref_stride,
196                                          unsigned int *sse) {
197   int sum;
198   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
199   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
200   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
201                           vpx_highbd_calc16x16var_sse2, 16);
202   return *sse;
203 }
204 
vpx_highbd_12_mse16x16_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)205 unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
206                                          const uint8_t *ref8, int ref_stride,
207                                          unsigned int *sse) {
208   int sum;
209   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
210   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
211   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
212                           vpx_highbd_calc16x16var_sse2, 16);
213   return *sse;
214 }
215 
vpx_highbd_8_mse8x8_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)216 unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
217                                       const uint8_t *ref8, int ref_stride,
218                                       unsigned int *sse) {
219   int sum;
220   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
221   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
222   highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
223                          vpx_highbd_calc8x8var_sse2, 8);
224   return *sse;
225 }
226 
vpx_highbd_10_mse8x8_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)227 unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
228                                        const uint8_t *ref8, int ref_stride,
229                                        unsigned int *sse) {
230   int sum;
231   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
232   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
233   highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
234                           vpx_highbd_calc8x8var_sse2, 8);
235   return *sse;
236 }
237 
vpx_highbd_12_mse8x8_sse2(const uint8_t * src8,int src_stride,const uint8_t * ref8,int ref_stride,unsigned int * sse)238 unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
239                                        const uint8_t *ref8, int ref_stride,
240                                        unsigned int *sse) {
241   int sum;
242   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
243   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
244   highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
245                           vpx_highbd_calc8x8var_sse2, 8);
246   return *sse;
247 }
248 
// The two unused parameters are placeholders for PIC-enabled builds.
// These declarations are for functions defined in
// highbd_subpel_variance_impl_sse2.asm.
//
// DECL deliberately carries no trailing ';': every use site supplies its own,
// so the expansion never contains an empty file-scope declaration.
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *ref, ptrdiff_t ref_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused)
#define DECLS(opt) \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(sse2);

#undef DECLS
#undef DECL
265 
// FN(w, h, wf, wlog2, hlog2, opt, cast) defines
// vpx_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>().
//   w, h          block dimensions in samples
//   wf            width handled by one call of the
//                 vpx_highbd_sub_pixel_variance<wf>xh_<opt> kernel
//   wlog2, hlog2  log2(w) and log2(h); se^2 >> (wlog2 + hlog2) is se^2/(w*h)
//   cast          cast applied to se before squaring (int64_t everywhere)
//
// Each generated function covers the block in wf-wide column strips, adds up
// the kernel's return value (se) and its *sse output, stores the (bit-depth
// scaled) sse in *sse_ptr and returns sse - se^2 / (w * h).
//   - 8-bit:  no scaling; the 32-bit difference is returned directly.
//   - 10-bit: se is rounded down by 2 bits and sse by 4 bits; a negative
//             result is clamped to 0.
//   - 12-bit: the kernel is called on at most 16 rows at a time and sse is
//             accumulated in 64 bits; se is rounded down by 4 bits and sse by
//             8 bits; a negative result is clamped to 0.
//             NOTE(review): the 16-row limit presumably keeps each per-call
//             32-bit sse from overflowing -- confirm against the asm kernel.
//
// The column strips are visited with a loop stepping by wf instead of the
// former hard-coded offsets 16/32/48, which were only right for wf == 16.
// For every instantiation below the calls and their order are unchanged.
// FN expands to complete function definitions, so neither FN nor FNS is
// followed by ';' (a lone ';' at file scope is not valid ISO C).
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                           \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                           \
    uint32_t sse = 0;                                                          \
    int se = 0;                                                                \
    int col;                                                                   \
    for (col = 0; col < w; col += wf) {                                        \
      uint32_t col_sse;                                                        \
      se += vpx_highbd_sub_pixel_variance##wf##xh_##opt(                       \
          src + col, src_stride, x_offset, y_offset, ref + col, ref_stride, h, \
          &col_sse, NULL, NULL);                                               \
      sse += col_sse;                                                          \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                           \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                           \
    uint32_t sse = 0;                                                          \
    int se = 0;                                                                \
    int col;                                                                   \
    int64_t var;                                                               \
    for (col = 0; col < w; col += wf) {                                        \
      uint32_t col_sse;                                                        \
      se += vpx_highbd_sub_pixel_variance##wf##xh_##opt(                       \
          src + col, src_stride, x_offset, y_offset, ref + col, ref_stride, h, \
          &col_sse, NULL, NULL);                                               \
      sse += col_sse;                                                          \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                           \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                           \
    uint64_t long_sse = 0;                                                     \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int start_row;                                                             \
    int64_t var;                                                               \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      const int height = h - start_row < 16 ? h - start_row : 16;              \
      const uint16_t *src_row = src + (start_row * src_stride);                \
      const uint16_t *ref_row = ref + (start_row * ref_stride);                \
      int col;                                                                 \
      for (col = 0; col < w; col += wf) {                                      \
        uint32_t col_sse;                                                      \
        se += vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src_row + col, src_stride, x_offset, y_offset, ref_row + col,      \
            ref_stride, height, &col_sse, NULL, NULL);                         \
        long_sse += col_sse;                                                   \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define FNS(opt)                       \
  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt, (int64_t))

FNS(sse2)

#undef FNS
#undef FN
403 
// The two unused parameters are placeholders for PIC-enabled builds.
//
// DECL deliberately carries no trailing ';': every use site supplies its own,
// so the expansion never contains an empty file-scope declaration.
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second,     \
      ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \
      void *unused)
#define DECLS(opt1) \
  DECL(16, opt1);   \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
418 
419 #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
420   uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
421       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
422       const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
423       const uint8_t *sec8) {                                                   \
424     uint32_t sse;                                                              \
425     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
426     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
427     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
428     int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
429         src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
430         NULL, NULL);                                                           \
431     if (w > wf) {                                                              \
432       uint32_t sse2;                                                           \
433       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
434           src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
435           sec + 16, w, h, &sse2, NULL, NULL);                                  \
436       se += se2;                                                               \
437       sse += sse2;                                                             \
438       if (w > wf * 2) {                                                        \
439         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
440             src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
441             sec + 32, w, h, &sse2, NULL, NULL);                                \
442         se += se2;                                                             \
443         sse += sse2;                                                           \
444         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
445             src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
446             sec + 48, w, h, &sse2, NULL, NULL);                                \
447         se += se2;                                                             \
448         sse += sse2;                                                           \
449       }                                                                        \
450     }                                                                          \
451     *sse_ptr = sse;                                                            \
452     return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
453   }                                                                            \
454                                                                                \
455   uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
456       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
457       const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
458       const uint8_t *sec8) {                                                   \
459     int64_t var;                                                               \
460     uint32_t sse;                                                              \
461     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
462     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
463     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
464     int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
465         src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
466         NULL, NULL);                                                           \
467     if (w > wf) {                                                              \
468       uint32_t sse2;                                                           \
469       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
470           src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
471           sec + 16, w, h, &sse2, NULL, NULL);                                  \
472       se += se2;                                                               \
473       sse += sse2;                                                             \
474       if (w > wf * 2) {                                                        \
475         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
476             src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
477             sec + 32, w, h, &sse2, NULL, NULL);                                \
478         se += se2;                                                             \
479         sse += sse2;                                                           \
480         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
481             src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
482             sec + 48, w, h, &sse2, NULL, NULL);                                \
483         se += se2;                                                             \
484         sse += sse2;                                                           \
485       }                                                                        \
486     }                                                                          \
487     se = ROUND_POWER_OF_TWO(se, 2);                                            \
488     sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
489     *sse_ptr = sse;                                                            \
490     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
491     return (var >= 0) ? (uint32_t)var : 0;                                     \
492   }                                                                            \
493                                                                                \
494   uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
495       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
496       const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
497       const uint8_t *sec8) {                                                   \
498     int start_row;                                                             \
499     int64_t var;                                                               \
500     uint32_t sse;                                                              \
501     int se = 0;                                                                \
502     uint64_t long_sse = 0;                                                     \
503     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
504     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
505     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
506     for (start_row = 0; start_row < h; start_row += 16) {                      \
507       uint32_t sse2;                                                           \
508       int height = h - start_row < 16 ? h - start_row : 16;                    \
509       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
510           src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
511           ref + (start_row * ref_stride), ref_stride, sec + (start_row * w),   \
512           w, height, &sse2, NULL, NULL);                                       \
513       se += se2;                                                               \
514       long_sse += sse2;                                                        \
515       if (w > wf) {                                                            \
516         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
517             src + 16 + (start_row * src_stride), src_stride, x_offset,         \
518             y_offset, ref + 16 + (start_row * ref_stride), ref_stride,         \
519             sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
520         se += se2;                                                             \
521         long_sse += sse2;                                                      \
522         if (w > wf * 2) {                                                      \
523           se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
524               src + 32 + (start_row * src_stride), src_stride, x_offset,       \
525               y_offset, ref + 32 + (start_row * ref_stride), ref_stride,       \
526               sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
527           se += se2;                                                           \
528           long_sse += sse2;                                                    \
529           se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
530               src + 48 + (start_row * src_stride), src_stride, x_offset,       \
531               y_offset, ref + 48 + (start_row * ref_stride), ref_stride,       \
532               sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
533           se += se2;                                                           \
534           long_sse += sse2;                                                    \
535         }                                                                      \
536       }                                                                        \
537     }                                                                          \
538     se = ROUND_POWER_OF_TWO(se, 4);                                            \
539     sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
540     *sse_ptr = sse;                                                            \
541     var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
542     return (var >= 0) ? (uint32_t)var : 0;                                     \
543   }
544 
545 #define FNS(opt1)                        \
546   FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
547   FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
548   FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
549   FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
550   FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
551   FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
552   FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
553   FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
554   FN(8, 16, 8, 4, 3, opt1, (int64_t));   \
555   FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
556   FN(8, 4, 8, 3, 2, opt1, (int64_t));
557 
558 FNS(sse2);
559 
560 #undef FNS
561 #undef FN
562