1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <algorithm>
13 #include <vector>
14 
15 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
16 
17 #include "./av1_rtcd.h"
18 #include "./aom_dsp_rtcd.h"
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/convolve.h"
23 #include "test/acm_random.h"
24 #include "test/util.h"
25 
26 using libaom_test::ACMRandom;
27 
28 namespace {
29 using std::tr1::tuple;
filter_block1d_horiz_c(const uint8_t * src_ptr,int src_stride,const int16_t * filter,int tap,uint8_t * dst_ptr,int dst_stride,int w,int h)30 static void filter_block1d_horiz_c(const uint8_t *src_ptr, int src_stride,
31                                    const int16_t *filter, int tap,
32                                    uint8_t *dst_ptr, int dst_stride, int w,
33                                    int h) {
34   src_ptr -= tap / 2 - 1;
35   for (int r = 0; r < h; ++r) {
36     for (int c = 0; c < w; ++c) {
37       int sum = 0;
38       for (int i = 0; i < tap; ++i) {
39         sum += src_ptr[c + i] * filter[i];
40       }
41       dst_ptr[c] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
42     }
43     src_ptr += src_stride;
44     dst_ptr += dst_stride;
45   }
46 }
47 
filter_block1d_vert_c(const uint8_t * src_ptr,int src_stride,const int16_t * filter,int tap,uint8_t * dst_ptr,int dst_stride,int w,int h)48 static void filter_block1d_vert_c(const uint8_t *src_ptr, int src_stride,
49                                   const int16_t *filter, int tap,
50                                   uint8_t *dst_ptr, int dst_stride, int w,
51                                   int h) {
52   src_ptr -= (tap / 2 - 1) * src_stride;
53   for (int r = 0; r < h; ++r) {
54     for (int c = 0; c < w; ++c) {
55       int sum = 0;
56       for (int i = 0; i < tap; ++i) {
57         sum += src_ptr[c + i * src_stride] * filter[i];
58       }
59       dst_ptr[c] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
60     }
61     src_ptr += src_stride;
62     dst_ptr += dst_stride;
63   }
64 }
65 
// Compares the top-left w x h region of two 8-bit images that may use
// different strides. Returns 1 when every pixel is equal, 0 at the first
// mismatch. An empty region (w <= 0 or h <= 0) counts as a match.
static int match(const uint8_t *out, int out_stride, const uint8_t *ref_out,
                 int ref_out_stride, int w, int h) {
  for (int row = 0; row < h; ++row) {
    const uint8_t *out_row = out + row * out_stride;
    const uint8_t *ref_row = ref_out + row * ref_out_stride;
    int col = 0;
    while (col < w && out_row[col] == ref_row[col]) ++col;
    if (col < w) return 0;  // stopped early: pixel (row, col) differs
  }
  return 1;
}
75 
76 typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
77                              int dst_stride, int w, int h,
78                              const InterpFilterParams filter_params,
79                              const int subpel_q4, int step_q4,
80                              ConvolveParams *conv_params);
81 
82 struct ConvolveFunctions {
ConvolveFunctions__anond82d75de0111::ConvolveFunctions83   ConvolveFunctions(ConvolveFunc hf, ConvolveFunc vf) : hf_(hf), vf_(vf) {}
84   ConvolveFunc hf_;
85   ConvolveFunc vf_;
86 };
87 
88 typedef tuple<ConvolveFunctions *, InterpFilter /*filter_x*/,
89               InterpFilter /*filter_y*/>
90     ConvolveParam;
91 
92 class Av1ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
93  public:
SetUp()94   virtual void SetUp() {
95     rnd_(ACMRandom::DeterministicSeed());
96     cfs_ = GET_PARAM(0);
97     interp_filter_ls_[0] = GET_PARAM(2);
98     interp_filter_ls_[2] = interp_filter_ls_[0];
99     interp_filter_ls_[1] = GET_PARAM(1);
100     interp_filter_ls_[3] = interp_filter_ls_[1];
101   }
TearDown()102   virtual void TearDown() {
103     while (buf_ls_.size() > 0) {
104       uint8_t *buf = buf_ls_.back();
105       aom_free(buf);
106       buf_ls_.pop_back();
107     }
108   }
add_input(int w,int h,int * stride)109   virtual uint8_t *add_input(int w, int h, int *stride) {
110     uint8_t *buf =
111         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, kBufferSize));
112     buf_ls_.push_back(buf);
113     *stride = w + MAX_FILTER_TAP - 1;
114     int offset = MAX_FILTER_TAP / 2 - 1;
115     for (int r = 0; r < h + MAX_FILTER_TAP - 1; ++r) {
116       for (int c = 0; c < w + MAX_FILTER_TAP - 1; ++c) {
117         buf[r * (*stride) + c] = rnd_.Rand8();
118       }
119     }
120     return buf + offset * (*stride) + offset;
121   }
add_output(int w,int,int * stride)122   virtual uint8_t *add_output(int w, int /*h*/, int *stride) {
123     uint8_t *buf =
124         reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, kBufferSize));
125     buf_ls_.push_back(buf);
126     *stride = w;
127     return buf;
128   }
random_init_buf(uint8_t * buf,int w,int h,int stride)129   virtual void random_init_buf(uint8_t *buf, int w, int h, int stride) {
130     for (int r = 0; r < h; ++r) {
131       for (int c = 0; c < w; ++c) {
132         buf[r * stride + c] = rnd_.Rand8();
133       }
134     }
135   }
136 
137  protected:
138   static const int kDataAlignment = 16;
139   static const int kOuterBlockSize = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
140   static const int kBufferSize = kOuterBlockSize * kOuterBlockSize;
141   std::vector<uint8_t *> buf_ls_;
142   InterpFilter interp_filter_ls_[4];
143   ConvolveFunctions *cfs_;
144   ACMRandom rnd_;
145 };
146 
147 int bsize_ls[] = { 1, 2, 4, 8, 16, 32, 64, 3, 7, 15, 31, 63 };
148 int bsize_num = NELEMENTS(bsize_ls);
149 
// Checks the vertical convolve function under test against
// filter_block1d_vert_c for every width/height pair in bsize_ls and every
// sub-pixel phase, first in overwrite mode and then in averaging mode.
TEST_P(Av1ConvolveTest, av1_convolve_vert) {
  const int y_step_q4 = 16;
  ConvolveParams conv_params = get_conv_params(0, 0, 0);

  // The vertical filter depends only on the test parameter, so resolve it
  // once instead of on every loop iteration.
  const InterpFilter filter_y = interp_filter_ls_[0];
  const InterpFilterParams param_vert = av1_get_interp_filter_params(filter_y);

  int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
  uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
  uint8_t *out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &out_stride);
  uint8_t *ref_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_out_stride);
  uint8_t *avg_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &avg_out_stride);
  uint8_t *ref_avg_out =
      add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_avg_out_stride);
  for (int hb_idx = 0; hb_idx < bsize_num; ++hb_idx) {
    for (int vb_idx = 0; vb_idx < bsize_num; ++vb_idx) {
      const int w = bsize_ls[hb_idx];
      const int h = bsize_ls[vb_idx];
      for (int subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; ++subpel_y_q4) {
        const int16_t *filter_vert =
            av1_get_interp_filter_subpel_kernel(param_vert, subpel_y_q4);

        // Expected result from the reference filter.
        filter_block1d_vert_c(in, in_stride, filter_vert, param_vert.taps,
                              ref_out, ref_out_stride, w, h);

        // Overwrite mode.
        conv_params.ref = 0;
        conv_params.do_average = 0;
        cfs_->vf_(in, in_stride, out, out_stride, w, h, param_vert, subpel_y_q4,
                  y_step_q4, &conv_params);
        EXPECT_EQ(match(out, out_stride, ref_out, ref_out_stride, w, h), 1)
            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_y "
            << filter_y << " subpel_y_q4 " << subpel_y_q4;

        // Averaging mode: the destination starts with random content and the
        // expected value is the rounded mean of that content and the
        // overwrite-mode output.
        random_init_buf(avg_out, w, h, avg_out_stride);
        for (int r = 0; r < h; ++r) {
          for (int c = 0; c < w; ++c) {
            ref_avg_out[r * ref_avg_out_stride + c] = ROUND_POWER_OF_TWO(
                avg_out[r * avg_out_stride + c] + out[r * out_stride + c], 1);
          }
        }
        conv_params.ref = 1;
        conv_params.do_average = 1;
        cfs_->vf_(in, in_stride, avg_out, avg_out_stride, w, h, param_vert,
                  subpel_y_q4, y_step_q4, &conv_params);
        EXPECT_EQ(match(avg_out, avg_out_stride, ref_avg_out,
                        ref_avg_out_stride, w, h),
                  1)
            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_y "
            << filter_y << " subpel_y_q4 " << subpel_y_q4;
      }
    }
  }
}
202 
// Checks the horizontal convolve function under test against
// filter_block1d_horiz_c for every width/height pair in bsize_ls and every
// sub-pixel phase, first in overwrite mode and then in averaging mode.
TEST_P(Av1ConvolveTest, av1_convolve_horiz) {
  const int x_step_q4 = 16;
  ConvolveParams conv_params = get_conv_params(0, 0, 0);

  // The horizontal filter depends only on the test parameter, so resolve it
  // once instead of on every loop iteration.
  const InterpFilter filter_x = interp_filter_ls_[1];
  const InterpFilterParams param_horiz = av1_get_interp_filter_params(filter_x);

  int in_stride, out_stride, ref_out_stride, avg_out_stride, ref_avg_out_stride;
  uint8_t *in = add_input(MAX_SB_SIZE, MAX_SB_SIZE, &in_stride);
  uint8_t *out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &out_stride);
  uint8_t *ref_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_out_stride);
  uint8_t *avg_out = add_output(MAX_SB_SIZE, MAX_SB_SIZE, &avg_out_stride);
  uint8_t *ref_avg_out =
      add_output(MAX_SB_SIZE, MAX_SB_SIZE, &ref_avg_out_stride);
  for (int hb_idx = 0; hb_idx < bsize_num; ++hb_idx) {
    for (int vb_idx = 0; vb_idx < bsize_num; ++vb_idx) {
      const int w = bsize_ls[hb_idx];
      const int h = bsize_ls[vb_idx];
      for (int subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; ++subpel_x_q4) {
        const int16_t *filter_horiz =
            av1_get_interp_filter_subpel_kernel(param_horiz, subpel_x_q4);

        // Expected result from the reference filter.
        filter_block1d_horiz_c(in, in_stride, filter_horiz, param_horiz.taps,
                               ref_out, ref_out_stride, w, h);

        // Overwrite mode.
        conv_params.ref = 0;
        conv_params.do_average = 0;
        cfs_->hf_(in, in_stride, out, out_stride, w, h, param_horiz,
                  subpel_x_q4, x_step_q4, &conv_params);
        EXPECT_EQ(match(out, out_stride, ref_out, ref_out_stride, w, h), 1)
            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_x "
            << filter_x << " subpel_x_q4 " << subpel_x_q4;

        // Averaging mode: the destination starts with random content and the
        // expected value is the rounded mean of that content and the
        // overwrite-mode output.
        random_init_buf(avg_out, w, h, avg_out_stride);
        for (int r = 0; r < h; ++r) {
          for (int c = 0; c < w; ++c) {
            ref_avg_out[r * ref_avg_out_stride + c] = ROUND_POWER_OF_TWO(
                avg_out[r * avg_out_stride + c] + out[r * out_stride + c], 1);
          }
        }
        conv_params.ref = 1;
        conv_params.do_average = 1;
        cfs_->hf_(in, in_stride, avg_out, avg_out_stride, w, h, param_horiz,
                  subpel_x_q4, x_step_q4, &conv_params);
        // Separators match the other failure messages so that labels and
        // values do not run together.
        EXPECT_EQ(match(avg_out, avg_out_stride, ref_avg_out,
                        ref_avg_out_stride, w, h),
                  1)
            << " hb_idx " << hb_idx << " vb_idx " << vb_idx << " filter_x "
            << filter_x << " subpel_x_q4 " << subpel_x_q4;
      }
    }
  }
}
255 
256 ConvolveFunctions convolve_functions_c(av1_convolve_horiz_c,
257                                        av1_convolve_vert_c);
258 
259 InterpFilter filter_ls[] = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH,
260                              MULTITAP_SHARP };
261 
262 INSTANTIATE_TEST_CASE_P(
263     C, Av1ConvolveTest,
264     ::testing::Combine(::testing::Values(&convolve_functions_c),
265                        ::testing::ValuesIn(filter_ls),
266                        ::testing::ValuesIn(filter_ls)));
267 
268 #if CONFIG_HIGHBITDEPTH
269 #ifndef __clang_analyzer__
// Convolves a single 10-bit pixel with av1_highbd_convolve() for every pair of
// sub-pixel phases and compares it with a straightforward two-pass
// (horizontal, then vertical) evaluation of the same kernels.
TEST(AV1ConvolveTest, av1_highbd_convolve) {
  // Largest tap count the fixed-size scratch arrays below can hold.
  const int kMaxTaps = 12;
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
  InterpFilterParams filter_params =
      av1_get_interp_filter_params(EIGHTTAP_REGULAR);
  const int filter_size = filter_params.taps;
  // The tap count comes from a runtime table; fail instead of overrunning
  // src[] / temp[] if it ever exceeds what they were sized for.
  ASSERT_LE(filter_size, kMaxTaps);
  const int filter_center = filter_size / 2 - 1;
  uint16_t src[kMaxTaps * kMaxTaps];
  const int src_stride = filter_size;
  uint16_t dst[1] = { 0 };
  const int dst_stride = 1;
  const int x_step_q4 = 16;
  const int y_step_q4 = 16;
  const int avg = 0;
  const int bd = 10;
  const int w = 1;
  const int h = 1;

  // Only the filter_size x filter_size window around the pixel is read.
  for (int i = 0; i < filter_size * filter_size; i++) {
    src[i] = rnd.Rand16() % (1 << bd);
  }

  for (int subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; subpel_x_q4++) {
    for (int subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; subpel_y_q4++) {
      av1_highbd_convolve(
          CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
          src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filters,
          subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);

      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);

      // Reference: filter each row horizontally (rounded and clipped), then
      // filter the resulting column vertically.
      int temp[kMaxTaps];
      int dst_ref = 0;
      for (int r = 0; r < filter_size; r++) {
        temp[r] = 0;
        for (int c = 0; c < filter_size; c++) {
          temp[r] += x_filter[c] * src[r * src_stride + c];
        }
        temp[r] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS), bd);
        dst_ref += temp[r] * y_filter[r];
      }
      dst_ref = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS), bd);
      EXPECT_EQ(dst[0], dst_ref) << " subpel_x_q4 " << subpel_x_q4
                                 << " subpel_y_q4 " << subpel_y_q4;
    }
  }
}
323 #endif
324 
TEST(AV1ConvolveTest,av1_highbd_convolve_avg)325 TEST(AV1ConvolveTest, av1_highbd_convolve_avg) {
326   ACMRandom rnd(ACMRandom::DeterministicSeed());
327   InterpFilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
328   InterpFilterParams filter_params =
329       av1_get_interp_filter_params(EIGHTTAP_REGULAR);
330   int filter_size = filter_params.taps;
331   int filter_center = filter_size / 2 - 1;
332   uint16_t src0[12 * 12];
333   uint16_t src1[12 * 12];
334   int src_stride = filter_size;
335   uint16_t dst0[1] = { 0 };
336   uint16_t dst1[1] = { 0 };
337   uint16_t dst[1] = { 0 };
338   int dst_stride = 1;
339   int x_step_q4 = 16;
340   int y_step_q4 = 16;
341   int avg = 0;
342   int bd = 10;
343 
344   int w = 1;
345   int h = 1;
346 
347   int subpel_x_q4;
348   int subpel_y_q4;
349 
350   for (int i = 0; i < filter_size * filter_size; i++) {
351     src0[i] = rnd.Rand16() % (1 << bd);
352     src1[i] = rnd.Rand16() % (1 << bd);
353   }
354 
355   for (subpel_x_q4 = 0; subpel_x_q4 < SUBPEL_SHIFTS; subpel_x_q4++) {
356     for (subpel_y_q4 = 0; subpel_y_q4 < SUBPEL_SHIFTS; subpel_y_q4++) {
357       int offset = filter_size * filter_center + filter_center;
358 
359       avg = 0;
360       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
361                           CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
362                           interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
363                           y_step_q4, avg, bd);
364       avg = 0;
365       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
366                           CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
367                           interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
368                           y_step_q4, avg, bd);
369 
370       avg = 0;
371       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
372                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
373                           interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
374                           y_step_q4, avg, bd);
375       avg = 1;
376       av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
377                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
378                           interp_filters, subpel_x_q4, x_step_q4, subpel_y_q4,
379                           y_step_q4, avg, bd);
380 
381       EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
382     }
383   }
384 }
385 #endif  // CONFIG_HIGHBITDEPTH
386 
387 #define CONVOLVE_SPEED_TEST 0
388 #if CONVOLVE_SPEED_TEST
// Defines a speed test that calls the high bit-depth convolve function `func`
// 100000 times on block_size x block_size blocks while walking over a
// frame_size x frame_size frame of random 10-bit pixels.
// The filter lookups use the same names as the correctness tests in this file
// (EIGHTTAP_REGULAR, .taps, av1_get_interp_filter_subpel_kernel), and the walk
// advances on row_offset so the block actually moves across the frame.
// NOTE(review): only compiled when CONVOLVE_SPEED_TEST is non-zero; confirm it
// builds against the current headers before enabling.
#define highbd_convolve_speed(func, block_size, frame_size)                  \
  TEST(AV1ConvolveTest, func##_speed_##block_size##_##frame_size) {          \
    ACMRandom rnd(ACMRandom::DeterministicSeed());                           \
    InterpFilter interp_filter = EIGHTTAP_REGULAR;                           \
    InterpFilterParams filter_params =                                       \
        av1_get_interp_filter_params(interp_filter);                         \
    int filter_size = filter_params.taps;                                    \
    int filter_center = filter_size / 2 - 1;                                 \
    DECLARE_ALIGNED(16, uint16_t,                                            \
                    src[(frame_size + 7) * (frame_size + 7)]) = { 0 };       \
    int src_stride = frame_size + 7;                                         \
    DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = { 0 };     \
    int dst_stride = frame_size;                                             \
    int x_step_q4 = 16;                                                      \
    int y_step_q4 = 16;                                                      \
    int subpel_x_q4 = 8;                                                     \
    int subpel_y_q4 = 6;                                                     \
    int bd = 10;                                                             \
                                                                             \
    int w = block_size;                                                      \
    int h = block_size;                                                      \
                                                                             \
    const int16_t *filter_x =                                                \
        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);     \
    const int16_t *filter_y =                                                \
        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);     \
                                                                             \
    for (int i = 0; i < src_stride * src_stride; i++) {                      \
      src[i] = rnd.Rand16() % (1 << bd);                                     \
    }                                                                        \
                                                                             \
    int offset = filter_center * src_stride + filter_center;                 \
    int row_offset = 0;                                                      \
    int col_offset = 0;                                                      \
    for (int i = 0; i < 100000; i++) {                                       \
      int src_total_offset = offset + col_offset * src_stride + row_offset;  \
      int dst_total_offset = col_offset * dst_stride + row_offset;           \
      func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride,           \
           CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
           x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
      if (row_offset + w + w < frame_size) {                                 \
        row_offset += w;                                                     \
      } else {                                                               \
        row_offset = 0;                                                      \
        col_offset += h;                                                     \
      }                                                                      \
      if (col_offset + h >= frame_size) {                                    \
        col_offset = 0;                                                      \
      }                                                                      \
    }                                                                        \
  }
440 
// Defines a speed test that calls the 8-bit convolve function `func` 100000
// times on block_size x block_size blocks while walking over a
// frame_size x frame_size frame of random pixels.
// The filter lookups use the same names as the correctness tests in this file
// (EIGHTTAP_REGULAR, .taps, av1_get_interp_filter_subpel_kernel). The walk
// advances on row_offset and the offsets are applied to both src and dst, so
// the memory access pattern mirrors highbd_convolve_speed.
// NOTE(review): only compiled when CONVOLVE_SPEED_TEST is non-zero; confirm it
// builds against the current headers before enabling.
#define lowbd_convolve_speed(func, block_size, frame_size)                  \
  TEST(AV1ConvolveTest, func##_speed_l_##block_size##_##frame_size) {       \
    ACMRandom rnd(ACMRandom::DeterministicSeed());                          \
    InterpFilter interp_filter = EIGHTTAP_REGULAR;                          \
    InterpFilterParams filter_params =                                      \
        av1_get_interp_filter_params(interp_filter);                        \
    int filter_size = filter_params.taps;                                   \
    int filter_center = filter_size / 2 - 1;                                \
    DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
    int src_stride = frame_size + 7;                                        \
    DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]);             \
    int dst_stride = frame_size;                                            \
    int x_step_q4 = 16;                                                     \
    int y_step_q4 = 16;                                                     \
    int subpel_x_q4 = 8;                                                    \
    int subpel_y_q4 = 6;                                                    \
    int bd = 8;                                                             \
                                                                            \
    int w = block_size;                                                     \
    int h = block_size;                                                     \
                                                                            \
    const int16_t *filter_x =                                               \
        av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);    \
    const int16_t *filter_y =                                               \
        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);    \
                                                                            \
    for (int i = 0; i < src_stride * src_stride; i++) {                     \
      src[i] = rnd.Rand16() % (1 << bd);                                    \
    }                                                                       \
                                                                            \
    int offset = filter_center * src_stride + filter_center;                \
    int row_offset = 0;                                                     \
    int col_offset = 0;                                                     \
    for (int i = 0; i < 100000; i++) {                                      \
      int src_total_offset = offset + col_offset * src_stride + row_offset; \
      int dst_total_offset = col_offset * dst_stride + row_offset;          \
      func(src + src_total_offset, src_stride, dst + dst_total_offset,      \
           dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h);     \
      if (row_offset + w + w < frame_size) {                                \
        row_offset += w;                                                    \
      } else {                                                              \
        row_offset = 0;                                                     \
        col_offset += h;                                                    \
      }                                                                     \
      if (col_offset + h >= frame_size) {                                   \
        col_offset = 0;                                                     \
      }                                                                     \
    }                                                                       \
  }
488 
// This experiment shows that when the frame size is 64x64,
// aom_highbd_convolve8_sse2 and aom_convolve8_sse2 run at similar speeds.
// However, when the frame size becomes 1024x1024,
// aom_highbd_convolve8_sse2 is around 50% slower than aom_convolve8_sse2.
// We think the bottleneck is memory I/O.
// Macro arguments are (function, block size, frame size).
// High bit-depth, 64x64 frame.
highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 64);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 64);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 64);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 64);

// Low bit-depth, 64x64 frame.
lowbd_convolve_speed(aom_convolve8_sse2, 8, 64);
lowbd_convolve_speed(aom_convolve8_sse2, 16, 64);
lowbd_convolve_speed(aom_convolve8_sse2, 32, 64);
lowbd_convolve_speed(aom_convolve8_sse2, 64, 64);

// High bit-depth, 1024x1024 frame.
highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 1024);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 1024);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 1024);
highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 1024);

// Low bit-depth, 1024x1024 frame.
lowbd_convolve_speed(aom_convolve8_sse2, 8, 1024);
lowbd_convolve_speed(aom_convolve8_sse2, 16, 1024);
lowbd_convolve_speed(aom_convolve8_sse2, 32, 1024);
lowbd_convolve_speed(aom_convolve8_sse2, 64, 1024);
513 #endif  // CONVOLVE_SPEED_TEST
514 }  // namespace
515