1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 #include "config/aom_config.h"
16 
17 #include "aom_ports/mem.h"
18 #include "aom/aom_integer.h"
19 
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/variance.h"
22 
23 // Load 2 sets of 4 bytes when alignment is not guaranteed.
load_unaligned_u8(const uint8_t * buf,int stride)24 static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
25   uint32_t a;
26   uint32x2_t a_u32 = vdup_n_u32(0);
27   if (stride == 4) return vld1_u8(buf);
28   memcpy(&a, buf, 4);
29   buf += stride;
30   a_u32 = vld1_lane_u32(&a, a_u32, 0);
31   memcpy(&a, buf, 4);
32   a_u32 = vld1_lane_u32(&a, a_u32, 1);
33   return vreinterpret_u8_u32(a_u32);
34 }
35 
36 // Process a block exactly 4 wide and a multiple of 2 high.
var_filter_block2d_bil_w4(const uint8_t * src_ptr,uint8_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,const uint8_t * filter)37 static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
38                                       uint8_t *output_ptr,
39                                       unsigned int src_pixels_per_line,
40                                       int pixel_step,
41                                       unsigned int output_height,
42                                       const uint8_t *filter) {
43   const uint8x8_t f0 = vdup_n_u8(filter[0]);
44   const uint8x8_t f1 = vdup_n_u8(filter[1]);
45   unsigned int i;
46   for (i = 0; i < output_height; i += 2) {
47     const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
48     const uint8x8_t src_1 =
49         load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
50     const uint16x8_t a = vmull_u8(src_0, f0);
51     const uint16x8_t b = vmlal_u8(a, src_1, f1);
52     const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
53     vst1_u8(output_ptr, out);
54     src_ptr += 2 * src_pixels_per_line;
55     output_ptr += 8;
56   }
57 }
58 
var_filter_block2d_bil_w8(const uint8_t * src_ptr,uint8_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)59 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
60                                       uint8_t *output_ptr,
61                                       unsigned int src_pixels_per_line,
62                                       int pixel_step,
63                                       unsigned int output_height,
64                                       unsigned int output_width,
65                                       const uint8_t *filter) {
66   const uint8x8_t f0 = vdup_n_u8(filter[0]);
67   const uint8x8_t f1 = vdup_n_u8(filter[1]);
68   unsigned int i;
69   for (i = 0; i < output_height; ++i) {
70     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
71     const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
72     const uint16x8_t a = vmull_u8(src_0, f0);
73     const uint16x8_t b = vmlal_u8(a, src_1, f1);
74     const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
75     vst1_u8(output_ptr, out);
76     // Next row...
77     src_ptr += src_pixels_per_line;
78     output_ptr += output_width;
79   }
80 }
81 
82 // Process a block which is a mutiple of 16 wide and any height.
var_filter_block2d_bil_w16(const uint8_t * src_ptr,uint8_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)83 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
84                                        uint8_t *output_ptr,
85                                        unsigned int src_pixels_per_line,
86                                        int pixel_step,
87                                        unsigned int output_height,
88                                        unsigned int output_width,
89                                        const uint8_t *filter) {
90   const uint8x8_t f0 = vdup_n_u8(filter[0]);
91   const uint8x8_t f1 = vdup_n_u8(filter[1]);
92   unsigned int i, j;
93   for (i = 0; i < output_height; ++i) {
94     for (j = 0; j < output_width; j += 16) {
95       const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
96       const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
97       const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
98       const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
99       const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
100       const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
101       const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
102       const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
103       vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
104     }
105     src_ptr += src_pixels_per_line;
106     output_ptr += output_width;
107   }
108 }
109 
aom_sub_pixel_variance8x8_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)110 unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
111                                             int xoffset, int yoffset,
112                                             const uint8_t *dst, int dst_stride,
113                                             unsigned int *sse) {
114   DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
115   DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
116 
117   var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
118                             bilinear_filters_2t[xoffset]);
119   var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
120                             bilinear_filters_2t[yoffset]);
121   return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
122 }
123 
aom_sub_pixel_variance16x16_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)124 unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
125                                               int src_stride, int xoffset,
126                                               int yoffset, const uint8_t *dst,
127                                               int dst_stride,
128                                               unsigned int *sse) {
129   DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
130   DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
131 
132   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
133                              bilinear_filters_2t[xoffset]);
134   var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
135                              bilinear_filters_2t[yoffset]);
136   return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
137 }
138 
aom_sub_pixel_variance32x32_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)139 unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
140                                               int src_stride, int xoffset,
141                                               int yoffset, const uint8_t *dst,
142                                               int dst_stride,
143                                               unsigned int *sse) {
144   DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
145   DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
146 
147   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
148                              bilinear_filters_2t[xoffset]);
149   var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
150                              bilinear_filters_2t[yoffset]);
151   return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
152 }
153 
aom_sub_pixel_variance64x64_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)154 unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
155                                               int src_stride, int xoffset,
156                                               int yoffset, const uint8_t *dst,
157                                               int dst_stride,
158                                               unsigned int *sse) {
159   DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
160   DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
161 
162   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
163                              bilinear_filters_2t[xoffset]);
164   var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
165                              bilinear_filters_2t[yoffset]);
166   return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
167 }
168 
aom_sub_pixel_variance4x4_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)169 unsigned int aom_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
170                                             int xoffset, int yoffset,
171                                             const uint8_t *b, int b_stride,
172                                             uint32_t *sse) {
173   uint8_t temp0[4 * (4 + 2)];
174   uint8_t temp1[4 * 4];
175 
176   var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (4 + 2),
177                             bilinear_filters_2t[xoffset]);
178   var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
179                             bilinear_filters_2t[yoffset]);
180 
181   return aom_variance4x4(temp1, 4, b, b_stride, sse);
182 }
183 
aom_sub_pixel_variance4x8_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)184 unsigned int aom_sub_pixel_variance4x8_neon(const uint8_t *a, int a_stride,
185                                             int xoffset, int yoffset,
186                                             const uint8_t *b, int b_stride,
187                                             uint32_t *sse) {
188   uint8_t temp0[4 * (8 + 2)];
189   uint8_t temp1[4 * 8];
190 
191   var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (8 + 2),
192                             bilinear_filters_2t[xoffset]);
193   var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 8,
194                             bilinear_filters_2t[yoffset]);
195 
196   return aom_variance4x8(temp1, 4, b, b_stride, sse);
197 }
198 
aom_sub_pixel_variance8x4_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)199 unsigned int aom_sub_pixel_variance8x4_neon(const uint8_t *a, int a_stride,
200                                             int xoffset, int yoffset,
201                                             const uint8_t *b, int b_stride,
202                                             uint32_t *sse) {
203   uint8_t temp0[8 * (4 + 1)];
204   uint8_t temp1[8 * 4];
205 
206   var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (4 + 1), 8,
207                             bilinear_filters_2t[xoffset]);
208   var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 4, 8,
209                             bilinear_filters_2t[yoffset]);
210 
211   return aom_variance8x4(temp1, 8, b, b_stride, sse);
212 }
213 
aom_sub_pixel_variance8x16_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)214 unsigned int aom_sub_pixel_variance8x16_neon(const uint8_t *a, int a_stride,
215                                              int xoffset, int yoffset,
216                                              const uint8_t *b, int b_stride,
217                                              uint32_t *sse) {
218   uint8_t temp0[8 * (16 + 1)];
219   uint8_t temp1[8 * 16];
220 
221   var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (16 + 1), 8,
222                             bilinear_filters_2t[xoffset]);
223   var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 16, 8,
224                             bilinear_filters_2t[yoffset]);
225 
226   return aom_variance8x16(temp1, 8, b, b_stride, sse);
227 }
228 
aom_sub_pixel_variance16x8_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)229 unsigned int aom_sub_pixel_variance16x8_neon(const uint8_t *a, int a_stride,
230                                              int xoffset, int yoffset,
231                                              const uint8_t *b, int b_stride,
232                                              uint32_t *sse) {
233   uint8_t temp0[16 * (8 + 1)];
234   uint8_t temp1[16 * 8];
235 
236   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 16,
237                              bilinear_filters_2t[xoffset]);
238   var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 8, 16,
239                              bilinear_filters_2t[yoffset]);
240 
241   return aom_variance16x8(temp1, 16, b, b_stride, sse);
242 }
243 
aom_sub_pixel_variance16x32_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)244 unsigned int aom_sub_pixel_variance16x32_neon(const uint8_t *a, int a_stride,
245                                               int xoffset, int yoffset,
246                                               const uint8_t *b, int b_stride,
247                                               uint32_t *sse) {
248   uint8_t temp0[16 * (32 + 1)];
249   uint8_t temp1[16 * 32];
250 
251   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 16,
252                              bilinear_filters_2t[xoffset]);
253   var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 32, 16,
254                              bilinear_filters_2t[yoffset]);
255 
256   return aom_variance16x32(temp1, 16, b, b_stride, sse);
257 }
258 
aom_sub_pixel_variance32x16_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)259 unsigned int aom_sub_pixel_variance32x16_neon(const uint8_t *a, int a_stride,
260                                               int xoffset, int yoffset,
261                                               const uint8_t *b, int b_stride,
262                                               uint32_t *sse) {
263   uint8_t temp0[32 * (16 + 1)];
264   uint8_t temp1[32 * 16];
265 
266   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 32,
267                              bilinear_filters_2t[xoffset]);
268   var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 16, 32,
269                              bilinear_filters_2t[yoffset]);
270 
271   return aom_variance32x16(temp1, 32, b, b_stride, sse);
272 }
273 
aom_sub_pixel_variance32x64_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)274 unsigned int aom_sub_pixel_variance32x64_neon(const uint8_t *a, int a_stride,
275                                               int xoffset, int yoffset,
276                                               const uint8_t *b, int b_stride,
277                                               uint32_t *sse) {
278   uint8_t temp0[32 * (64 + 1)];
279   uint8_t temp1[32 * 64];
280 
281   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 32,
282                              bilinear_filters_2t[xoffset]);
283   var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 64, 32,
284                              bilinear_filters_2t[yoffset]);
285 
286   return aom_variance32x64(temp1, 32, b, b_stride, sse);
287 }
288 
aom_sub_pixel_variance64x32_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)289 unsigned int aom_sub_pixel_variance64x32_neon(const uint8_t *a, int a_stride,
290                                               int xoffset, int yoffset,
291                                               const uint8_t *b, int b_stride,
292                                               uint32_t *sse) {
293   uint8_t temp0[64 * (32 + 1)];
294   uint8_t temp1[64 * 32];
295 
296   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (32 + 1), 64,
297                              bilinear_filters_2t[xoffset]);
298   var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 32, 64,
299                              bilinear_filters_2t[yoffset]);
300 
301   return aom_variance64x32(temp1, 64, b, b_stride, sse);
302 }
303 
aom_sub_pixel_variance64x128_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)304 unsigned int aom_sub_pixel_variance64x128_neon(const uint8_t *a, int a_stride,
305                                                int xoffset, int yoffset,
306                                                const uint8_t *b, int b_stride,
307                                                uint32_t *sse) {
308   uint8_t temp0[64 * (128 + 1)];
309   uint8_t temp1[64 * 128];
310 
311   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 64,
312                              bilinear_filters_2t[xoffset]);
313   var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 128, 64,
314                              bilinear_filters_2t[yoffset]);
315 
316   return aom_variance64x128(temp1, 64, b, b_stride, sse);
317 }
318 
aom_sub_pixel_variance128x64_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)319 unsigned int aom_sub_pixel_variance128x64_neon(const uint8_t *a, int a_stride,
320                                                int xoffset, int yoffset,
321                                                const uint8_t *b, int b_stride,
322                                                uint32_t *sse) {
323   uint8_t temp0[128 * (64 + 1)];
324   uint8_t temp1[128 * 64];
325 
326   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 128,
327                              bilinear_filters_2t[xoffset]);
328   var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 64, 128,
329                              bilinear_filters_2t[yoffset]);
330 
331   return aom_variance128x64(temp1, 128, b, b_stride, sse);
332 }
333 
aom_sub_pixel_variance128x128_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)334 unsigned int aom_sub_pixel_variance128x128_neon(const uint8_t *a, int a_stride,
335                                                 int xoffset, int yoffset,
336                                                 const uint8_t *b, int b_stride,
337                                                 uint32_t *sse) {
338   uint8_t temp0[128 * (128 + 1)];
339   uint8_t temp1[128 * 128];
340 
341   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (128 + 1), 128,
342                              bilinear_filters_2t[xoffset]);
343   var_filter_block2d_bil_w16(temp0, temp1, 128, 128, 128, 128,
344                              bilinear_filters_2t[yoffset]);
345 
346   return aom_variance128x128(temp1, 128, b, b_stride, sse);
347 }
348 
// Realtime-only builds don't use the 1:4 / 4:1 rectangular block sizes below.
350 #if !CONFIG_REALTIME_ONLY
aom_sub_pixel_variance4x16_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)351 unsigned int aom_sub_pixel_variance4x16_neon(const uint8_t *a, int a_stride,
352                                              int xoffset, int yoffset,
353                                              const uint8_t *b, int b_stride,
354                                              uint32_t *sse) {
355   uint8_t temp0[4 * (16 + 2)];
356   uint8_t temp1[4 * 16];
357 
358   var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (16 + 2),
359                             bilinear_filters_2t[xoffset]);
360   var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 16,
361                             bilinear_filters_2t[yoffset]);
362 
363   return aom_variance4x16(temp1, 4, b, b_stride, sse);
364 }
365 
aom_sub_pixel_variance8x32_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)366 unsigned int aom_sub_pixel_variance8x32_neon(const uint8_t *a, int a_stride,
367                                              int xoffset, int yoffset,
368                                              const uint8_t *b, int b_stride,
369                                              uint32_t *sse) {
370   uint8_t temp0[8 * (32 + 1)];
371   uint8_t temp1[8 * 32];
372 
373   var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (32 + 1), 8,
374                             bilinear_filters_2t[xoffset]);
375   var_filter_block2d_bil_w8(temp0, temp1, 8, 8, 32, 8,
376                             bilinear_filters_2t[yoffset]);
377 
378   return aom_variance8x32(temp1, 8, b, b_stride, sse);
379 }
380 
aom_sub_pixel_variance16x4_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)381 unsigned int aom_sub_pixel_variance16x4_neon(const uint8_t *a, int a_stride,
382                                              int xoffset, int yoffset,
383                                              const uint8_t *b, int b_stride,
384                                              uint32_t *sse) {
385   uint8_t temp0[16 * (4 + 1)];
386   uint8_t temp1[16 * 4];
387 
388   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (4 + 1), 16,
389                              bilinear_filters_2t[xoffset]);
390   var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 4, 16,
391                              bilinear_filters_2t[yoffset]);
392 
393   return aom_variance16x4(temp1, 16, b, b_stride, sse);
394 }
395 
aom_sub_pixel_variance64x16_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)396 unsigned int aom_sub_pixel_variance64x16_neon(const uint8_t *a, int a_stride,
397                                               int xoffset, int yoffset,
398                                               const uint8_t *b, int b_stride,
399                                               uint32_t *sse) {
400   uint8_t temp0[64 * (16 + 1)];
401   uint8_t temp1[64 * 16];
402 
403   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (16 + 1), 64,
404                              bilinear_filters_2t[xoffset]);
405   var_filter_block2d_bil_w16(temp0, temp1, 64, 64, 16, 64,
406                              bilinear_filters_2t[yoffset]);
407 
408   return aom_variance64x16(temp1, 64, b, b_stride, sse);
409 }
410 
aom_sub_pixel_variance16x64_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)411 unsigned int aom_sub_pixel_variance16x64_neon(const uint8_t *a, int a_stride,
412                                               int xoffset, int yoffset,
413                                               const uint8_t *b, int b_stride,
414                                               uint32_t *sse) {
415   uint8_t temp0[16 * (64 + 1)];
416   uint8_t temp1[16 * 64];
417 
418   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (64 + 1), 16,
419                              bilinear_filters_2t[xoffset]);
420   var_filter_block2d_bil_w16(temp0, temp1, 16, 16, 64, 16,
421                              bilinear_filters_2t[yoffset]);
422 
423   return aom_variance16x64(temp1, 16, b, b_stride, sse);
424 }
425 
aom_sub_pixel_variance32x8_neon(const uint8_t * a,int a_stride,int xoffset,int yoffset,const uint8_t * b,int b_stride,uint32_t * sse)426 unsigned int aom_sub_pixel_variance32x8_neon(const uint8_t *a, int a_stride,
427                                              int xoffset, int yoffset,
428                                              const uint8_t *b, int b_stride,
429                                              uint32_t *sse) {
430   uint8_t temp0[32 * (8 + 1)];
431   uint8_t temp1[32 * 8];
432 
433   var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (8 + 1), 32,
434                              bilinear_filters_2t[xoffset]);
435   var_filter_block2d_bil_w16(temp0, temp1, 32, 32, 8, 32,
436                              bilinear_filters_2t[yoffset]);
437 
438   return aom_variance32x8(temp1, 32, b, b_stride, sse);
439 }
440 #endif  // !CONFIG_REALTIME_ONLY
441