// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/convolve.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
constexpr int kHorizontalOffset = 3;
constexpr int kFilterIndexShift = 6;

// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from exceeding the range of int16_t.
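// For example, the largest-magnitude 8 tap half filter in this file is
// {-2, 6, -12, 40, 40, -12, 6, -2}, whose absolute tap sum is 120, so the
// worst case is 255 * 120 = 30600, which fits in int16_t; the full
// (unshifted) taps would double that and overflow.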
template <int filter_index, bool negative_outside_taps = false>
int16x8_t SumOnePassTaps(const uint8x8_t* const src,
                         const uint8x8_t* const taps) {
  uint16x8_t sum;
  if (filter_index == 0) {
    // 6 taps. + - + + - +
    sum = vmull_u8(src[0], taps[0]);
    // Unsigned overflow will result in a valid int16_t value.
    sum = vmlsl_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlsl_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1 && negative_outside_taps) {
    // 6 taps. - + + + + -
    // Set a base we can subtract from.
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1) {
    // 6 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 2) {
    // 8 taps. - + - + + - + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlsl_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
    sum = vmlal_u8(sum, src[6], taps[6]);
    sum = vmlsl_u8(sum, src[7], taps[7]);
  } else if (filter_index == 3) {
    // 2 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
  } else if (filter_index == 4) {
    // 4 taps. - + + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlsl_u8(sum, src[3], taps[3]);
  } else if (filter_index == 5) {
    // 4 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
  }
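  // |filter_index| is a template parameter, so the branches above are
  // resolved at compile time and only the selected tap pattern is emitted.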
  return vreinterpretq_s16_u16(sum);
}

template <int filter_index, bool negative_outside_taps>
int16x8_t SumHorizontalTaps(const uint8_t* const src,
                            const uint8x8_t* const v_tap) {
  uint8x8_t v_src[8];
  const uint8x16_t src_long = vld1q_u8(src);
  int16x8_t sum;

  if (filter_index < 2) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
  } else if (filter_index == 2) {
    v_src[0] = vget_low_u8(src_long);
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
  } else if (filter_index == 3) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
  } else if (filter_index > 3) {
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
  }
  return sum;
}

template <int filter_index, bool negative_outside_taps>
uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  int16x8_t sum =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
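  // Concretely, with kInterRoundBitsHorizontal == 3 and kFilterBits == 7 (and
  // taps already halved), the two-stage form is (((sum + 2) >> 2) + 8) >> 4
  // == (sum + 2 + 32) >> 6: the vqrshrun_n_s16() below supplies the +32 and
  // the shift by 6, and |first_shift_rounding_bit| supplies the +2.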
  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);

  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
  return vqrshrun_n_s16(sum, kFilterBits - 1);
}

template <int filter_index, bool negative_outside_taps>
uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  const int16x8_t sum =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);

  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}

template <int filter_index>
int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                               const uint8x8_t* const v_tap) {
  uint16x8_t sum;
  const uint8x8_t input0 = vld1_u8(src);
  src += src_stride;
  const uint8x8_t input1 = vld1_u8(src);
  uint8x8x2_t input = vzip_u8(input0, input1);
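  // After the zip, lane 2*i of input.val[0] holds row0 pixel i and lane
  // 2*i + 1 holds row1 pixel i (input.val[1] continues with pixels 4..7), so
  // each vext_u8() below selects a sliding two-row window in one vector.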

  if (filter_index == 3) {
    // tap signs : + +
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
  } else if (filter_index == 4) {
    // tap signs : - + + -
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  } else {
    // tap signs : + + + +
    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  }

  return vreinterpretq_s16_u16(sum);
}

template <int filter_index>
uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
                                  const ptrdiff_t src_stride,
                                  const uint8x8_t* const v_tap) {
  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);

  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
  return vqrshrun_n_s16(sum, kFilterBits - 1);
}

template <int filter_index>
uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
                                   const ptrdiff_t src_stride,
                                   const uint8x8_t* const v_tap) {
  const int16x8_t sum =
      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

  return vreinterpretq_u16_s16(
      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}

template <int num_taps, int step, int filter_index,
          bool negative_outside_taps = true, bool is_2d = false,
          bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
                      void* const dest, const ptrdiff_t pred_stride,
                      const int width, const int height,
                      const uint8x8_t* const v_tap) {
  auto* dest8 = static_cast<uint8_t*>(dest);
  auto* dest16 = static_cast<uint16_t*>(dest);

  // 4 tap filters are never used when width > 4.
  if (num_taps != 4 && width > 4) {
    int y = 0;
    do {
      int x = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
                                                                       v_tap);
          vst1q_u16(&dest16[x], v_sum);
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
                                                                        v_tap);
          vst1_u8(&dest8[x], result);
        }
        x += step;
      } while (x < width);
      src += src_stride;
      dest8 += pred_stride;
      dest16 += pred_stride;
    } while (++y < height);
    return;
  }

  // The horizontal pass only needs to account for |num_taps| values of 2 and
  // 4 when |width| <= 4.
  assert(width <= 4);
  assert(num_taps <= 4);
  if (num_taps <= 4) {
    if (width == 4) {
      int y = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
                                                                       v_tap);
          vst1_u16(dest16, vget_low_u16(v_sum));
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
                                                                        v_tap);
          StoreLo4(&dest8[0], result);
        }
        src += src_stride;
        dest8 += pred_stride;
        dest16 += pred_stride;
      } while (++y < height);
      return;
    }

    if (!is_compound) {
      int y = 0;
      do {
        if (is_2d) {
          const uint16x8_t sum =
              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
          dest16[0] = vgetq_lane_u16(sum, 0);
          dest16[1] = vgetq_lane_u16(sum, 2);
          dest16 += pred_stride;
          dest16[0] = vgetq_lane_u16(sum, 1);
          dest16[1] = vgetq_lane_u16(sum, 3);
          dest16 += pred_stride;
        } else {
          const uint8x8_t sum =
              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

          dest8[0] = vget_lane_u8(sum, 0);
          dest8[1] = vget_lane_u8(sum, 2);
          dest8 += pred_stride;

          dest8[0] = vget_lane_u8(sum, 1);
          dest8[1] = vget_lane_u8(sum, 3);
          dest8 += pred_stride;
        }

        src += src_stride << 1;
        y += 2;
      } while (y < height - 1);

      // The 2d filters have an odd |height| because the horizontal pass
      // generates context for the vertical pass.
      if (is_2d) {
        assert(height % 2 == 1);
        uint16x8_t sum;
        const uint8x8_t input = vld1_u8(src);
        if (filter_index == 3) {  // |num_taps| == 2
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
        } else if (filter_index == 4) {
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        } else {
          assert(filter_index == 5);
          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        }
        // |sum| contains an int16_t value.
        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
        Store2<0>(dest16, sum);
      }
    }
  }
}

// Process 16 bit inputs and output 32 bits.
template <int num_taps, bool is_compound>
inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
                                    const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum;
  if (num_taps == 8) {
    sum = vmull_lane_s16(src[0], taps_lo, 0);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
  } else if (num_taps == 6) {
    sum = vmull_lane_s16(src[0], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
  } else if (num_taps == 4) {
    sum = vmull_lane_s16(src[0], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
  } else if (num_taps == 2) {
    sum = vmull_lane_s16(src[0], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
  }

  if (is_compound) {
    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
  }

  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
}

template <int num_taps, bool is_compound>
int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
                                  const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum_lo, sum_hi;
  if (num_taps == 8) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
  } else if (num_taps == 6) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
  } else if (num_taps == 4) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
  } else if (num_taps == 2) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
  }

  if (is_compound) {
    return vcombine_s16(
        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
  }

  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
}

template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, void* const dst,
                      const ptrdiff_t dst_stride, const int width,
                      const int height, const int16x8_t taps) {
  assert(width >= 8);
  constexpr int next_row = num_taps - 1;
  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
  const ptrdiff_t src_stride = width;

  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int x = 0;
  do {
    int16x8_t srcs[8];
    const uint16_t* src_x = src + x;
    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
        }
      }
    }

    int y = 0;
    do {
      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;

      const int16x8_t sum =
          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
      if (is_compound) {
        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
      } else {
        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
      }

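      // Slide the tap window down one row so only a single new row needs to
      // be loaded for the next output row.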
      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (++y < height);
    x += 8;
  } while (x < width);
}

// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
void Filter2DVertical4xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int16x8_t srcs[9];
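  // Each 128-bit register holds two consecutive 4-wide rows, so the odd
  // entries, which start one row later, are assembled from the high half of
  // one register and the low half of the next.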
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 4) {
    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
    if (num_taps >= 6) {
      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
      src += 8;
      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
      if (num_taps == 8) {
        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
        src += 8;
        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
      }
    }
  }

  int y = 0;
  do {
    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
                                      vget_low_s16(srcs[num_taps]));

    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
    if (is_compound) {
      const uint16x8_t results = vreinterpretq_u16_s16(sum);
      vst1q_u16(dst16, results);
      dst16 += 4 << 1;
    } else {
      const uint8x8_t results = vqmovun_s16(sum);

      StoreLo4(dst8, results);
      dst8 += dst_stride;
      StoreHi4(dst8, results);
      dst8 += dst_stride;
    }

    srcs[0] = srcs[2];
    if (num_taps >= 4) {
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      if (num_taps >= 6) {
        srcs[3] = srcs[5];
        srcs[4] = srcs[6];
        if (num_taps == 8) {
          srcs[5] = srcs[7];
          srcs[6] = srcs[8];
        }
      }
    }
    y += 2;
  } while (y < height);
}

// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
void Filter2DVertical2xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  constexpr int next_row = (num_taps < 6) ? 4 : 8;

  auto* dst8 = static_cast<uint8_t*>(dst);

  int16x8_t srcs[9];
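  // Each 128-bit register holds four consecutive 2-wide rows, so registers
  // offset by a single row are produced by shifting a pair of loads by 2
  // lanes (one row) with vextq_s16().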
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 6) {
    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    if (num_taps == 8) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    }
  }

  int y = 0;
  do {
    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    if (num_taps == 2) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    } else if (num_taps == 4) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    } else if (num_taps == 6) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
    } else if (num_taps == 8) {
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
    }

    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
    const uint8x8_t results = vqmovun_s16(sum);

    Store2<0>(dst8, results);
    dst8 += dst_stride;
    Store2<1>(dst8, results);
    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
    // Therefore we don't need to check this condition when |height| > 4.
    if (num_taps <= 4 && height == 2) return;
    dst8 += dst_stride;
    Store2<2>(dst8, results);
    dst8 += dst_stride;
    Store2<3>(dst8, results);
    dst8 += dst_stride;

    srcs[0] = srcs[4];
    if (num_taps == 6) {
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
    } else if (num_taps == 8) {
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
    }

    y += 4;
  } while (y < height);
}

template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
    const ptrdiff_t dst_stride, const int width, const int height,
    const int filter_id, const int filter_index) {
  // Duplicate the absolute value for each tap. Negative taps are corrected
  // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
  uint8x8_t v_tap[kSubPixelTaps];
  assert(filter_id != 0);

  for (int k = 0; k < kSubPixelTaps; ++k) {
    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
  }

  if (filter_index == 2) {  // 8 tap.
    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 1) {  // 6 tap.
    // Check if outside taps are positive.
    if ((filter_id == 1) | (filter_id == 15)) {
      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
          src, src_stride, dst, dst_stride, width, height, v_tap);
    } else {
      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
          src, src_stride, dst, dst_stride, width, height, v_tap);
    }
  } else if (filter_index == 0) {  // 6 tap.
    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 4) {  // 4 tap.
    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else if (filter_index == 5) {  // 4 tap.
    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  } else {  // 2 tap.
    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
        src, src_stride, dst, dst_stride, width, height, v_tap);
  }
}

int GetNumTapsInFilter(const int filter_index) {
  if (filter_index < 2) {
    // Despite the names these only use 6 taps.
    // kInterpolationFilterEightTap
    // kInterpolationFilterEightTapSmooth
    return 6;
  }

  if (filter_index == 2) {
    // kInterpolationFilterEightTapSharp
    return 8;
  }

  if (filter_index == 3) {
    // kInterpolationFilterBilinear
    return 2;
  }

  assert(filter_index > 3);
  // For small sizes (width/height <= 4) the large filters are replaced with 4
  // tap options.
  // If the original filters were |kInterpolationFilterEightTap| or
  // |kInterpolationFilterEightTapSharp| then it becomes
  // |kInterpolationFilterSwitchable|.
  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
  // tap filter.
  return 4;
}

void Convolve2D_NEON(const void* const reference,
                     const ptrdiff_t reference_stride,
                     const int horizontal_filter_index,
                     const int vertical_filter_index,
                     const int horizontal_filter_id,
                     const int vertical_filter_id, const int width,
                     const int height, void* prediction,
                     const ptrdiff_t pred_stride) {
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);

  // The output of the horizontal filter is guaranteed to fit in 16 bits.
  uint16_t
      intermediate_result[kMaxSuperBlockSizeInPixels *
                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
  const int intermediate_height = height + vertical_taps - 1;

  const ptrdiff_t src_stride = reference_stride;
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;

  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
                                   width, intermediate_height,
                                   horizontal_filter_id, horiz_filter_index);

  // Vertical filter.
  auto* dest = static_cast<uint8_t*>(prediction);
  const ptrdiff_t dest_stride = pred_stride;
  assert(vertical_filter_id != 0);

  const int16x8_t taps = vmovl_s8(
      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));

  if (vertical_taps == 8) {
    if (width == 2) {
      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else if (vertical_taps == 6) {
    if (width == 2) {
      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else if (vertical_taps == 4) {
    if (width == 2) {
      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  } else {  // |vertical_taps| == 2
    if (width == 2) {
      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else if (width == 4) {
      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
                             taps);
    } else {
      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
                          taps);
    }
  }
}

// There are many opportunities for overreading in scaled convolve, because the
// range of starting points for filter windows is anywhere from 0 to 16 for 8
// destination pixels, and the window sizes range from 2 to 8. To accommodate
// this range concisely, we use |grade_x| to mean the most steps in src that can
// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
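// As an illustration, with kScaleSubPixelBits == 10 a |step_x| of 1536 is 1.5
// source pixels per destination pixel. The last of the 8 lanes can then start
// as much as 11 whole pixels into the load, and an 8 tap window from there
// reads through src_x[18], which the extra 8 byte load for |grade_x| == 2
// covers.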
template <int grade_x>
inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
  uint8x8x3_t ret;
  const uint8x16_t src_val = vld1q_u8(src_x);
  ret.val[0] = vget_low_u8(src_val);
  ret.val[1] = vget_high_u8(src_val);
  if (grade_x > 1) {
    ret.val[2] = vld1_u8(src_x + 16);
  }
  return ret;
}

// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
  assert(tap_index < 2);
  alignas(
      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};

  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
}

template <int grade_x>
inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
                                         const ptrdiff_t src_stride,
                                         const int width, const int subpixel_x,
                                         const int step_x,
                                         const int intermediate_height,
                                         int16_t* intermediate) {
  // Account for the 0-taps that precede the 2 nonzero taps.
  const int kernel_offset = 3;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
  const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
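  // vcreate_u8(0x0706050403020100) materializes the byte vector
  // {0, 1, ..., 7}, so after widening and multiplying, |index_steps| holds
  // {0 * step_x, 1 * step_x, ..., 7 * step_x}.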
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);

  int p = subpixel_x;
  if (width <= 4) {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // This is a special case. The 2-tap filter has no negative taps, so we
    // can use unsigned values.
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                               VQTbl1U8(filter_taps1, filter_indices)};
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x16_t src_vals = vld1q_u8(src_x);
      const uint8x8_t src_indices =
          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));

      // For each x, a lane of src[k] contains src_x[k].
      const uint8x8_t src[2] = {
          VQTbl1U8(src_vals, src_indices),
          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};

      vst1q_s16(intermediate,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate += kIntermediateStride;
    } while (++y < intermediate_height);
    return;
  }

  // |width| >= 8
  int x = 0;
  do {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // This is a special case. The 2-tap filter has no negative taps, so we
    // can use unsigned values.
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                               VQTbl1U8(filter_taps1, filter_indices)};
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
      const uint8x8_t src_indices =
          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));

      // For each x, a lane of src[k] contains src_x[k].
      const uint8x8_t src[2] = {
          vtbl3_u8(src_vals, src_indices),
          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
  assert(tap_index < 4);
  alignas(
      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};

  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
}

// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  const int p = subpixel_x;
  // The first filter (filter_id == 0) is the identity: a single tap of 128
  // (64 after halving) on the center pixel.
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
  const uint8x8_t filter_indices = vand_u8(
      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

  const uint8x8_t src_indices =
      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
  int y = 0;
  do {
    // Load a pool of samples to select from using stepped index vectors.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, src[k] contains src_x[k].
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
                           kInterRoundBitsHorizontal - 1));

    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}

// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
  assert(tap_index < 4);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};

  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
}

// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
                                            static_cast<uint16_t>(step_x));

  const int p = subpixel_x;
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
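  // Only the low four lanes are meaningful here; pad with zero so the
  // narrowing shift can operate on a full 128-bit vector.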
  const uint8x8_t filter_index_offsets = vshrn_n_u16(
      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
  const uint8x8_t filter_indices =
      vand_u8(filter_index_offsets, filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, taps[k] has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

  const uint8x8_t src_indices_base =
      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);

  const uint8x8_t src_indices[4] = {src_indices_base,
                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
                                    vadd_u8(src_indices_base, vdup_n_u8(3))};

  int y = 0;
  do {
    // Load a pool of samples to select from using stepped indices.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, src[k] contains src_x[k].
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
                           kInterRoundBitsHorizontal - 1));
    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}

// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
  assert(tap_index < 6);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};

  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
}

// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  const int kernel_offset = 1;
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x16_t filter_taps[6];
  for (int i = 0; i < 6; ++i) {
    filter_taps[i] = GetSigned6TapFilter(i);
  }
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));

  int x = 0;
  int p = subpixel_x;
  do {
    // Avoid overreading outside the reference boundaries. This means the
    // trailing read can be up to 24 bytes.
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    uint8x8_t src_lookup[6];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 6; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    uint8x8_t taps[6];
    for (int i = 0; i < 6; ++i) {
      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
    }
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      const uint8x8_t src[6] = {
          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}
1160 
1161 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
1162 // has mixed positive and negative outer taps which are handled in
1163 // GetMixed6TapFilter().
GetPositive6TapFilter(const int tap_index)1164 inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
1165   assert(tap_index < 6);
1166   alignas(16) static constexpr uint8_t
1167       kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
1168           {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
1169           {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
1170           {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
1171           {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
1172 
1173   return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
1174 }
1175 
GetMixed6TapFilter(const int tap_index)1176 inline int8x16_t GetMixed6TapFilter(const int tap_index) {
1177   assert(tap_index < 2);
1178   alignas(
1179       16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
1180       {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
1181       {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
1182 
1183   return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
1184 }
1185 
1186 // This filter is only possible when width >= 8.
1187 template <int grade_x>
ConvolveKernelHorizontalMixed6Tap(const uint8_t * src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * intermediate)1188 inline void ConvolveKernelHorizontalMixed6Tap(
1189     const uint8_t* src, const ptrdiff_t src_stride, const int width,
1190     const int subpixel_x, const int step_x, const int intermediate_height,
1191     int16_t* intermediate) {
1192   const int kernel_offset = 1;
1193   const uint8x8_t one = vdup_n_u8(1);
1194   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1195   const int ref_x = subpixel_x >> kScaleSubPixelBits;
1196   const int step_x8 = step_x << 3;
1197   uint8x8_t taps[4];
1198   int16x8_t mixed_taps[2];
1199   uint8x16_t positive_filter_taps[4];
1200   for (int i = 0; i < 4; ++i) {
1201     positive_filter_taps[i] = GetPositive6TapFilter(i);
1202   }
1203   int8x16_t mixed_filter_taps[2];
1204   mixed_filter_taps[0] = GetMixed6TapFilter(0);
1205   mixed_filter_taps[1] = GetMixed6TapFilter(1);
1206   const uint16x8_t index_steps = vmulq_n_u16(
1207       vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1208 
1209   int x = 0;
1210   int p = subpixel_x;
1211   do {
1212     const uint8_t* src_x =
1213         &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
1214     int16_t* intermediate_x = intermediate + x;
1215     // Only add steps to the 10-bit truncated p to avoid overflow.
1216     const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1217     const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1218     const uint8x8_t src_indices =
1219         vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1220     uint8x8_t src_lookup[6];
1221     src_lookup[0] = src_indices;
1222     for (int i = 1; i < 6; ++i) {
1223       src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1224     }
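    // A worked example of the index math, with illustrative values: given
    // kScaleSubPixelBits == 10, suppose step_x == 1536 (1.5 source pixels per
    // output column) and (p & 1023) == 512. Lane 3 of |subpel_index_offsets|
    // is then 512 + 3 * 1536 == 5120, so lane 3 of |src_indices| is
    // 5120 >> 10 == 5 and the corresponding lane of |filter_indices| computed
    // below is (5120 >> 6) & 15 == 0.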
1225 
1226     const uint8x8_t filter_indices =
1227         vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1228                 filter_index_mask);
1229     // For each x, a lane of taps[k] has
1230     // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1231     // on x.
1232     for (int i = 0; i < 4; ++i) {
1233       taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
1234     }
1235     mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
1236     mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
1237 
1238     int y = 0;
1239     do {
1240       // Load a pool of samples to select from using stepped indices.
1241       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1242 
1243       int16x8_t sum_mixed = vmulq_s16(
1244           mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
1245       sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
1246                             ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
1247       uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
1248       sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
1249       sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
1250       sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
1251       sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
1252 
1253       vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
1254                                              kInterRoundBitsHorizontal - 1));
1255       src_x += src_stride;
1256       intermediate_x += kIntermediateStride;
1257     } while (++y < intermediate_height);
1258     x += 8;
1259     p += step_x8;
1260   } while (x < width);
1261 }
1262 
1263 // Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
1264 inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
1265   assert(tap_index < 8);
1266   alignas(16) static constexpr uint8_t
1267       kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
1268           {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
1269           {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
1270           {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
1271           {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
1272           {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
1273           {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
1274           {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
1275           {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
1276 
1277   return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
1278 }
1279 
1280 // This filter is only possible when width >= 8.
1281 template <int grade_x>
1282 inline void ConvolveKernelHorizontalSigned8Tap(
1283     const uint8_t* src, const ptrdiff_t src_stride, const int width,
1284     const int subpixel_x, const int step_x, const int intermediate_height,
1285     int16_t* intermediate) {
1286   const uint8x8_t one = vdup_n_u8(1);
1287   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
1288   const int ref_x = subpixel_x >> kScaleSubPixelBits;
1289   const int step_x8 = step_x << 3;
1290   uint8x8_t taps[8];
1291   uint8x16_t filter_taps[8];
1292   for (int i = 0; i < 8; ++i) {
1293     filter_taps[i] = GetSigned8TapFilter(i);
1294   }
1295   const uint16x8_t index_steps = vmulq_n_u16(
1296       vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
1297   int x = 0;
1298   int p = subpixel_x;
1299   do {
1300     const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
1301     int16_t* intermediate_x = intermediate + x;
1302     // Only add steps to the 10-bit truncated p to avoid overflow.
1303     const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
1304     const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
1305     const uint8x8_t src_indices =
1306         vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
1307     uint8x8_t src_lookup[8];
1308     src_lookup[0] = src_indices;
1309     for (int i = 1; i < 8; ++i) {
1310       src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
1311     }
1312 
1313     const uint8x8_t filter_indices =
1314         vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
1315                 filter_index_mask);
1316     // For each x, a lane of taps[k] has
1317     // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
1318     // on x.
1319     for (int i = 0; i < 8; ++i) {
1320       taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
1321     }
1322 
1323     int y = 0;
1324     do {
1325       // Load a pool of samples to select from using stepped indices.
1326       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
1327 
1328       const uint8x8_t src[8] = {
1329           vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
1330           vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
1331           vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
1332           vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
1333 
1334       vst1q_s16(intermediate_x,
1335                 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
1336                              kInterRoundBitsHorizontal - 1));
1337       src_x += src_stride;
1338       intermediate_x += kIntermediateStride;
1339     } while (++y < intermediate_height);
1340     x += 8;
1341     p += step_x8;
1342   } while (x < width);
1343 }
1344 
1345 // This function handles blocks of width 2 or 4.
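// Each iteration of the loop below produces two output rows, reloading the
// filter for each row because |filter_id| changes as |p| advances by
// |step_y|. |grade_y| is the maximum number of source rows the read position
// can advance per output row: 1 when step_y <= 1024 (at most one source row
// per output row), 2 for larger steps. The extra rows are loaded
// speculatively since that is cheaper than branching.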
1346 template <int num_taps, int grade_y, int width, bool is_compound>
1347 void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
1348                               const int filter_index, const int step_y,
1349                               const int height, void* dest,
1350                               const ptrdiff_t dest_stride) {
1351   constexpr ptrdiff_t src_stride = kIntermediateStride;
1352   const int16_t* src_y = src;
1353   // |dest| is 16-bit in compound mode, Pixel otherwise.
1354   uint16_t* dest16_y = static_cast<uint16_t*>(dest);
1355   uint8_t* dest_y = static_cast<uint8_t*>(dest);
1356   int16x4_t s[num_taps + grade_y];
1357 
1358   int p = subpixel_y & 1023;
1359   int prev_p = p;
1360   int y = 0;
1361   do {  // y < height
1362     for (int i = 0; i < num_taps; ++i) {
1363       s[i] = vld1_s16(src_y + i * src_stride);
1364     }
1365     int filter_id = (p >> 6) & kSubPixelMask;
1366     int16x8_t filter =
1367         vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1368     int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
1369     if (is_compound) {
1370       assert(width != 2);
1371       const uint16x4_t result = vreinterpret_u16_s16(sums);
1372       vst1_u16(dest16_y, result);
1373     } else {
1374       const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1375       if (width == 2) {
1376         Store2<0>(dest_y, result);
1377       } else {
1378         StoreLo4(dest_y, result);
1379       }
1380     }
1381     p += step_y;
1382     const int p_diff =
1383         (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1384     prev_p = p;
1385     // Here we load extra source in case it is needed. If |p_diff| == 0, these
1386     // values will be unused, but it's faster to load than to branch.
1387     s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
1388     if (grade_y > 1) {
1389       s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
1390     }
1391     dest16_y += dest_stride;
1392     dest_y += dest_stride;
1393 
1394     filter_id = (p >> 6) & kSubPixelMask;
1395     filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1396     sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
1397     if (is_compound) {
1398       assert(width != 2);
1399       const uint16x4_t result = vreinterpret_u16_s16(sums);
1400       vst1_u16(dest16_y, result);
1401     } else {
1402       const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1403       if (width == 2) {
1404         Store2<0>(dest_y, result);
1405       } else {
1406         StoreLo4(dest_y, result);
1407       }
1408     }
1409     p += step_y;
1410     src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1411     prev_p = p;
1412     dest16_y += dest_stride;
1413     dest_y += dest_stride;
1414 
1415     y += 2;
1416   } while (y < height);
1417 }
1418 
1419 template <int num_taps, int grade_y, bool is_compound>
1420 inline void ConvolveVerticalScale(const int16_t* src, const int width,
1421                                   const int subpixel_y, const int filter_index,
1422                                   const int step_y, const int height,
1423                                   void* dest, const ptrdiff_t dest_stride) {
1424   constexpr ptrdiff_t src_stride = kIntermediateStride;
1425   // A possible improvement is to use arithmetic to decide how many times to
1426   // apply filters to the same source before checking whether to load new srcs.
1427   // However, this will only improve performance with very small step sizes.
1428   int16x8_t s[num_taps + grade_y];
1429   // |dest| is 16-bit in compound mode, Pixel otherwise.
1430   uint16_t* dest16_y;
1431   uint8_t* dest_y;
1432 
1433   int x = 0;
1434   do {  // x < width
1435     const int16_t* src_x = src + x;
1436     const int16_t* src_y = src_x;
1437     dest16_y = static_cast<uint16_t*>(dest) + x;
1438     dest_y = static_cast<uint8_t*>(dest) + x;
1439     int p = subpixel_y & 1023;
1440     int prev_p = p;
1441     int y = 0;
1442     do {  // y < height
1443       for (int i = 0; i < num_taps; ++i) {
1444         s[i] = vld1q_s16(src_y + i * src_stride);
1445       }
1446       int filter_id = (p >> 6) & kSubPixelMask;
1447       int16x8_t filter =
1448           vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1449       int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
1450       if (is_compound) {
1451         vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1452       } else {
1453         vst1_u8(dest_y, vqmovun_s16(sum));
1454       }
1455       p += step_y;
1456       const int p_diff =
1457           (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1458       // When |grade_y| > 1, |p_diff| may be as large as 2, so load both
1459       // vectors that may be needed. Otherwise, we only need to load one
1460       // vector because |p_diff| can't exceed 1.
1461       s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
1462       if (grade_y > 1) {
1463         s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
1464       }
1465       dest16_y += dest_stride;
1466       dest_y += dest_stride;
1467 
1468       filter_id = (p >> 6) & kSubPixelMask;
1469       filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1470       sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
1471       if (is_compound) {
1472         vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1473       } else {
1474         vst1_u8(dest_y, vqmovun_s16(sum));
1475       }
1476       p += step_y;
1477       src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
1478       prev_p = p;
1479       dest16_y += dest_stride;
1480       dest_y += dest_stride;
1481 
1482       y += 2;
1483     } while (y < height);
1484     x += 8;
1485   } while (x < width);
1486 }
1487 
1488 template <bool is_compound>
1489 void ConvolveScale2D_NEON(const void* const reference,
1490                           const ptrdiff_t reference_stride,
1491                           const int horizontal_filter_index,
1492                           const int vertical_filter_index, const int subpixel_x,
1493                           const int subpixel_y, const int step_x,
1494                           const int step_y, const int width, const int height,
1495                           void* prediction, const ptrdiff_t pred_stride) {
1496   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1497   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1498   assert(step_x <= 2048);
1499   const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
1500   const int intermediate_height =
1501       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
1502        kScaleSubPixelBits) +
1503       num_vert_taps;
1505   // The output of the horizontal filter, i.e. the intermediate_result, is
1506   // guaranteed to fit in int16_t.
1507   int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
1508                               (2 * kMaxSuperBlockSizeInPixels + 8)];
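  // 2 * kMaxSuperBlockSizeInPixels + 8 rows are needed because, assuming
  // |step_y| is bounded by 2048 like |step_x| (2x downscaling at most),
  // |intermediate_height| can approach twice the block height plus the
  // vertical filter overhang.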
1509 
1510   // Horizontal filter.
1511   // Filter types used for width <= 4 are different from those for width > 4.
1512   // When width > 4, the valid filter index range is always [0, 3].
1513   // When width <= 4, the valid filter index range is always [3, 5].
1514   // Similarly for height.
1515   int filter_index = horiz_filter_index;
1516   int16_t* intermediate = intermediate_result;
1517   const ptrdiff_t src_stride = reference_stride;
1518   const auto* src = static_cast<const uint8_t*>(reference);
1519   const int vert_kernel_offset = (8 - num_vert_taps) / 2;
1520   src += vert_kernel_offset * src_stride;
1521 
1522   // Derive the maximum value of |step_x| at which all source values fit in one
1523   // 16-byte load: the final index must satisfy src_x + |num_taps| - 1 < 16.
1524   // step_x*7 is the final base subpel index for the shuffle mask for filter
1525   // inputs in each iteration on large blocks. When step_x is large, we need a
1526   // larger structure and use a larger table lookup in order to gather all
1527   // filter inputs.
1528   // |num_taps| - 1 is the shuffle index of the final filter input.
1529   const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
1530   const int kernel_start_ceiling = 16 - num_horiz_taps;
1531   // This truncated quotient |grade_x_threshold| selects |step_x| such that:
1532   // (step_x * 7) >> kScaleSubPixelBits < single load limit
1533   const int grade_x_threshold =
1534       (kernel_start_ceiling << kScaleSubPixelBits) / 7;
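  // For example, with an 8 tap horizontal filter |kernel_start_ceiling| is 8,
  // so |grade_x_threshold| is (8 << 10) / 7 == 1170. Any |step_x| above that
  // selects the <2> variants below, which load a wider pool of source values
  // per row.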
1535   switch (filter_index) {
1536     case 0:
1537       if (step_x > grade_x_threshold) {
1538         ConvolveKernelHorizontalSigned6Tap<2>(
1539             src, src_stride, width, subpixel_x, step_x, intermediate_height,
1540             intermediate);
1541       } else {
1542         ConvolveKernelHorizontalSigned6Tap<1>(
1543             src, src_stride, width, subpixel_x, step_x, intermediate_height,
1544             intermediate);
1545       }
1546       break;
1547     case 1:
1548       if (step_x > grade_x_threshold) {
1549         ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
1550                                              step_x, intermediate_height,
1551                                              intermediate);
1552 
1553       } else {
1554         ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
1555                                              step_x, intermediate_height,
1556                                              intermediate);
1557       }
1558       break;
1559     case 2:
1560       if (step_x > grade_x_threshold) {
1561         ConvolveKernelHorizontalSigned8Tap<2>(
1562             src, src_stride, width, subpixel_x, step_x, intermediate_height,
1563             intermediate);
1564       } else {
1565         ConvolveKernelHorizontalSigned8Tap<1>(
1566             src, src_stride, width, subpixel_x, step_x, intermediate_height,
1567             intermediate);
1568       }
1569       break;
1570     case 3:
1571       if (step_x > grade_x_threshold) {
1572         ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
1573                                         step_x, intermediate_height,
1574                                         intermediate);
1575       } else {
1576         ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
1577                                         step_x, intermediate_height,
1578                                         intermediate);
1579       }
1580       break;
1581     case 4:
1582       assert(width <= 4);
1583       ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
1584                                          intermediate_height, intermediate);
1585       break;
1586     default:
1587       assert(filter_index == 5);
1588       ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
1589                                            intermediate_height, intermediate);
1590   }
1591   // Vertical filter.
1592   filter_index = vert_filter_index;
1593   intermediate = intermediate_result;
1594 
1595   switch (filter_index) {
1596     case 0:
1597     case 1:
1598       if (step_y <= 1024) {
1599         if (!is_compound && width == 2) {
1600           ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
1601               intermediate, subpixel_y, filter_index, step_y, height,
1602               prediction, pred_stride);
1603         } else if (width == 4) {
1604           ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
1605               intermediate, subpixel_y, filter_index, step_y, height,
1606               prediction, pred_stride);
1607         } else {
1608           ConvolveVerticalScale<6, 1, is_compound>(
1609               intermediate, width, subpixel_y, filter_index, step_y, height,
1610               prediction, pred_stride);
1611         }
1612       } else {
1613         if (!is_compound && width == 2) {
1614           ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
1615               intermediate, subpixel_y, filter_index, step_y, height,
1616               prediction, pred_stride);
1617         } else if (width == 4) {
1618           ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
1619               intermediate, subpixel_y, filter_index, step_y, height,
1620               prediction, pred_stride);
1621         } else {
1622           ConvolveVerticalScale<6, 2, is_compound>(
1623               intermediate, width, subpixel_y, filter_index, step_y, height,
1624               prediction, pred_stride);
1625         }
1626       }
1627       break;
1628     case 2:
1629       if (step_y <= 1024) {
1630         if (!is_compound && width == 2) {
1631           ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
1632               intermediate, subpixel_y, filter_index, step_y, height,
1633               prediction, pred_stride);
1634         } else if (width == 4) {
1635           ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
1636               intermediate, subpixel_y, filter_index, step_y, height,
1637               prediction, pred_stride);
1638         } else {
1639           ConvolveVerticalScale<8, 1, is_compound>(
1640               intermediate, width, subpixel_y, filter_index, step_y, height,
1641               prediction, pred_stride);
1642         }
1643       } else {
1644         if (!is_compound && width == 2) {
1645           ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
1646               intermediate, subpixel_y, filter_index, step_y, height,
1647               prediction, pred_stride);
1648         } else if (width == 4) {
1649           ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
1650               intermediate, subpixel_y, filter_index, step_y, height,
1651               prediction, pred_stride);
1652         } else {
1653           ConvolveVerticalScale<8, 2, is_compound>(
1654               intermediate, width, subpixel_y, filter_index, step_y, height,
1655               prediction, pred_stride);
1656         }
1657       }
1658       break;
1659     case 3:
1660       if (step_y <= 1024) {
1661         if (!is_compound && width == 2) {
1662           ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
1663               intermediate, subpixel_y, filter_index, step_y, height,
1664               prediction, pred_stride);
1665         } else if (width == 4) {
1666           ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
1667               intermediate, subpixel_y, filter_index, step_y, height,
1668               prediction, pred_stride);
1669         } else {
1670           ConvolveVerticalScale<2, 1, is_compound>(
1671               intermediate, width, subpixel_y, filter_index, step_y, height,
1672               prediction, pred_stride);
1673         }
1674       } else {
1675         if (!is_compound && width == 2) {
1676           ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
1677               intermediate, subpixel_y, filter_index, step_y, height,
1678               prediction, pred_stride);
1679         } else if (width == 4) {
1680           ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
1681               intermediate, subpixel_y, filter_index, step_y, height,
1682               prediction, pred_stride);
1683         } else {
1684           ConvolveVerticalScale<2, 2, is_compound>(
1685               intermediate, width, subpixel_y, filter_index, step_y, height,
1686               prediction, pred_stride);
1687         }
1688       }
1689       break;
1690     case 4:
1691     default:
1692       assert(filter_index == 4 || filter_index == 5);
1693       assert(height <= 4);
1694       if (step_y <= 1024) {
1695         if (!is_compound && width == 2) {
1696           ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
1697               intermediate, subpixel_y, filter_index, step_y, height,
1698               prediction, pred_stride);
1699         } else if (width == 4) {
1700           ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
1701               intermediate, subpixel_y, filter_index, step_y, height,
1702               prediction, pred_stride);
1703         } else {
1704           ConvolveVerticalScale<4, 1, is_compound>(
1705               intermediate, width, subpixel_y, filter_index, step_y, height,
1706               prediction, pred_stride);
1707         }
1708       } else {
1709         if (!is_compound && width == 2) {
1710           ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
1711               intermediate, subpixel_y, filter_index, step_y, height,
1712               prediction, pred_stride);
1713         } else if (width == 4) {
1714           ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
1715               intermediate, subpixel_y, filter_index, step_y, height,
1716               prediction, pred_stride);
1717         } else {
1718           ConvolveVerticalScale<4, 2, is_compound>(
1719               intermediate, width, subpixel_y, filter_index, step_y, height,
1720               prediction, pred_stride);
1721         }
1722       }
1723   }
1724 }
1725 
1726 void ConvolveHorizontal_NEON(const void* const reference,
1727                              const ptrdiff_t reference_stride,
1728                              const int horizontal_filter_index,
1729                              const int /*vertical_filter_index*/,
1730                              const int horizontal_filter_id,
1731                              const int /*vertical_filter_id*/, const int width,
1732                              const int height, void* prediction,
1733                              const ptrdiff_t pred_stride) {
1734   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1735   // Set |src| to the outermost tap.
1736   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1737   auto* dest = static_cast<uint8_t*>(prediction);
1738 
1739   DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
1740                    horizontal_filter_id, filter_index);
1741 }
1742 
1743 // The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
1744 // vertical calculations.
1745 uint16x8_t Compound1DShift(const int16x8_t sum) {
1746   return vreinterpretq_u16_s16(
1747       vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
1748 }
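// In Compound1DShift() above, the shift is kInterRoundBitsHorizontal - 1 == 2
// for 8bpp; the remaining bit of rounding precision is already absorbed by
// the halved filter coefficients.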
1749 
1750 template <int filter_index, bool is_compound = false,
1751           bool negative_outside_taps = false>
1752 void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
1753                     void* const dst, const ptrdiff_t dst_stride,
1754                     const int width, const int height,
1755                     const uint8x8_t* const taps) {
1756   const int num_taps = GetNumTapsInFilter(filter_index);
1757   const int next_row = num_taps - 1;
1758   auto* dst8 = static_cast<uint8_t*>(dst);
1759   auto* dst16 = static_cast<uint16_t*>(dst);
1760   assert(width >= 8);
1761 
1762   int x = 0;
1763   do {
1764     const uint8_t* src_x = src + x;
1765     uint8x8_t srcs[8];
1766     srcs[0] = vld1_u8(src_x);
1767     src_x += src_stride;
1768     if (num_taps >= 4) {
1769       srcs[1] = vld1_u8(src_x);
1770       src_x += src_stride;
1771       srcs[2] = vld1_u8(src_x);
1772       src_x += src_stride;
1773       if (num_taps >= 6) {
1774         srcs[3] = vld1_u8(src_x);
1775         src_x += src_stride;
1776         srcs[4] = vld1_u8(src_x);
1777         src_x += src_stride;
1778         if (num_taps == 8) {
1779           srcs[5] = vld1_u8(src_x);
1780           src_x += src_stride;
1781           srcs[6] = vld1_u8(src_x);
1782           src_x += src_stride;
1783         }
1784       }
1785     }
1786 
1787     int y = 0;
1788     do {
1789       srcs[next_row] = vld1_u8(src_x);
1790       src_x += src_stride;
1791 
1792       const int16x8_t sums =
1793           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1794       if (is_compound) {
1795         const uint16x8_t results = Compound1DShift(sums);
1796         vst1q_u16(dst16 + x + y * dst_stride, results);
1797       } else {
1798         const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1799         vst1_u8(dst8 + x + y * dst_stride, results);
1800       }
1801 
1802       srcs[0] = srcs[1];
1803       if (num_taps >= 4) {
1804         srcs[1] = srcs[2];
1805         srcs[2] = srcs[3];
1806         if (num_taps >= 6) {
1807           srcs[3] = srcs[4];
1808           srcs[4] = srcs[5];
1809           if (num_taps == 8) {
1810             srcs[5] = srcs[6];
1811             srcs[6] = srcs[7];
1812           }
1813         }
1814       }
1815     } while (++y < height);
1816     x += 8;
1817   } while (x < width);
1818 }
1819 
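// For 4-wide blocks, two rows are packed into each 8-lane vector: lanes 0-3
// hold row n and lanes 4-7 hold row n + 1. vext_u8 then builds the vectors
// offset by one row, so each SumOnePassTaps() call filters two rows at once.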
1820 template <int filter_index, bool is_compound = false,
1821           bool negative_outside_taps = false>
1822 void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
1823                        void* const dst, const ptrdiff_t dst_stride,
1824                        const int height, const uint8x8_t* const taps) {
1825   const int num_taps = GetNumTapsInFilter(filter_index);
1826   auto* dst8 = static_cast<uint8_t*>(dst);
1827   auto* dst16 = static_cast<uint16_t*>(dst);
1828 
1829   uint8x8_t srcs[9];
1830 
1831   if (num_taps == 2) {
1832     srcs[2] = vdup_n_u8(0);
1833 
1834     srcs[0] = Load4(src);
1835     src += src_stride;
1836 
1837     int y = 0;
1838     do {
1839       srcs[0] = Load4<1>(src, srcs[0]);
1840       src += src_stride;
1841       srcs[2] = Load4<0>(src, srcs[2]);
1842       src += src_stride;
1843       srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1844 
1845       const int16x8_t sums =
1846           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1847       if (is_compound) {
1848         const uint16x8_t results = Compound1DShift(sums);
1849 
1850         vst1q_u16(dst16, results);
1851         dst16 += 4 << 1;
1852       } else {
1853         const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1854 
1855         StoreLo4(dst8, results);
1856         dst8 += dst_stride;
1857         StoreHi4(dst8, results);
1858         dst8 += dst_stride;
1859       }
1860 
1861       srcs[0] = srcs[2];
1862       y += 2;
1863     } while (y < height);
1864   } else if (num_taps == 4) {
1865     srcs[4] = vdup_n_u8(0);
1866 
1867     srcs[0] = Load4(src);
1868     src += src_stride;
1869     srcs[0] = Load4<1>(src, srcs[0]);
1870     src += src_stride;
1871     srcs[2] = Load4(src);
1872     src += src_stride;
1873     srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1874 
1875     int y = 0;
1876     do {
1877       srcs[2] = Load4<1>(src, srcs[2]);
1878       src += src_stride;
1879       srcs[4] = Load4<0>(src, srcs[4]);
1880       src += src_stride;
1881       srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1882 
1883       const int16x8_t sums =
1884           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1885       if (is_compound) {
1886         const uint16x8_t results = Compound1DShift(sums);
1887 
1888         vst1q_u16(dst16, results);
1889         dst16 += 4 << 1;
1890       } else {
1891         const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1892 
1893         StoreLo4(dst8, results);
1894         dst8 += dst_stride;
1895         StoreHi4(dst8, results);
1896         dst8 += dst_stride;
1897       }
1898 
1899       srcs[0] = srcs[2];
1900       srcs[1] = srcs[3];
1901       srcs[2] = srcs[4];
1902       y += 2;
1903     } while (y < height);
1904   } else if (num_taps == 6) {
1905     srcs[6] = vdup_n_u8(0);
1906 
1907     srcs[0] = Load4(src);
1908     src += src_stride;
1909     srcs[0] = Load4<1>(src, srcs[0]);
1910     src += src_stride;
1911     srcs[2] = Load4(src);
1912     src += src_stride;
1913     srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1914     srcs[2] = Load4<1>(src, srcs[2]);
1915     src += src_stride;
1916     srcs[4] = Load4(src);
1917     src += src_stride;
1918     srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1919 
1920     int y = 0;
1921     do {
1922       srcs[4] = Load4<1>(src, srcs[4]);
1923       src += src_stride;
1924       srcs[6] = Load4<0>(src, srcs[6]);
1925       src += src_stride;
1926       srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1927 
1928       const int16x8_t sums =
1929           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1930       if (is_compound) {
1931         const uint16x8_t results = Compound1DShift(sums);
1932 
1933         vst1q_u16(dst16, results);
1934         dst16 += 4 << 1;
1935       } else {
1936         const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1937 
1938         StoreLo4(dst8, results);
1939         dst8 += dst_stride;
1940         StoreHi4(dst8, results);
1941         dst8 += dst_stride;
1942       }
1943 
1944       srcs[0] = srcs[2];
1945       srcs[1] = srcs[3];
1946       srcs[2] = srcs[4];
1947       srcs[3] = srcs[5];
1948       srcs[4] = srcs[6];
1949       y += 2;
1950     } while (y < height);
1951   } else if (num_taps == 8) {
1952     srcs[8] = vdup_n_u8(0);
1953 
1954     srcs[0] = Load4(src);
1955     src += src_stride;
1956     srcs[0] = Load4<1>(src, srcs[0]);
1957     src += src_stride;
1958     srcs[2] = Load4(src);
1959     src += src_stride;
1960     srcs[1] = vext_u8(srcs[0], srcs[2], 4);
1961     srcs[2] = Load4<1>(src, srcs[2]);
1962     src += src_stride;
1963     srcs[4] = Load4(src);
1964     src += src_stride;
1965     srcs[3] = vext_u8(srcs[2], srcs[4], 4);
1966     srcs[4] = Load4<1>(src, srcs[4]);
1967     src += src_stride;
1968     srcs[6] = Load4(src);
1969     src += src_stride;
1970     srcs[5] = vext_u8(srcs[4], srcs[6], 4);
1971 
1972     int y = 0;
1973     do {
1974       srcs[6] = Load4<1>(src, srcs[6]);
1975       src += src_stride;
1976       srcs[8] = Load4<0>(src, srcs[8]);
1977       src += src_stride;
1978       srcs[7] = vext_u8(srcs[6], srcs[8], 4);
1979 
1980       const int16x8_t sums =
1981           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
1982       if (is_compound) {
1983         const uint16x8_t results = Compound1DShift(sums);
1984 
1985         vst1q_u16(dst16, results);
1986         dst16 += 4 << 1;
1987       } else {
1988         const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
1989 
1990         StoreLo4(dst8, results);
1991         dst8 += dst_stride;
1992         StoreHi4(dst8, results);
1993         dst8 += dst_stride;
1994       }
1995 
1996       srcs[0] = srcs[2];
1997       srcs[1] = srcs[3];
1998       srcs[2] = srcs[4];
1999       srcs[3] = srcs[5];
2000       srcs[4] = srcs[6];
2001       srcs[5] = srcs[7];
2002       srcs[6] = srcs[8];
2003       y += 2;
2004     } while (y < height);
2005   }
2006 }
2007 
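// Same packing idea as FilterVertical4xH(), but with four 2-pixel rows per
// 8-lane vector. vext_u8 with byte offsets 2, 4 and 6 produces the
// row-shifted vectors, and Store2<0..3> writes four output rows per
// iteration.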
2008 template <int filter_index, bool negative_outside_taps = false>
2009 void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
2010                        void* const dst, const ptrdiff_t dst_stride,
2011                        const int height, const uint8x8_t* const taps) {
2012   const int num_taps = GetNumTapsInFilter(filter_index);
2013   auto* dst8 = static_cast<uint8_t*>(dst);
2014 
2015   uint8x8_t srcs[9];
2016 
2017   if (num_taps == 2) {
2018     srcs[2] = vdup_n_u8(0);
2019 
2020     srcs[0] = Load2(src);
2021     src += src_stride;
2022 
2023     int y = 0;
2024     do {
2025       srcs[0] = Load2<1>(src, srcs[0]);
2026       src += src_stride;
2027       srcs[0] = Load2<2>(src, srcs[0]);
2028       src += src_stride;
2029       srcs[0] = Load2<3>(src, srcs[0]);
2030       src += src_stride;
2031       srcs[2] = Load2<0>(src, srcs[2]);
2032       src += src_stride;
2033       srcs[1] = vext_u8(srcs[0], srcs[2], 2);
2034 
2035       // This uses srcs[0]..srcs[1].
2036       const int16x8_t sums =
2037           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2038       const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2039 
2040       Store2<0>(dst8, results);
2041       dst8 += dst_stride;
2042       Store2<1>(dst8, results);
2043       if (height == 2) return;
2044       dst8 += dst_stride;
2045       Store2<2>(dst8, results);
2046       dst8 += dst_stride;
2047       Store2<3>(dst8, results);
2048       dst8 += dst_stride;
2049 
2050       srcs[0] = srcs[2];
2051       y += 4;
2052     } while (y < height);
2053   } else if (num_taps == 4) {
2054     srcs[4] = vdup_n_u8(0);
2055 
2056     srcs[0] = Load2(src);
2057     src += src_stride;
2058     srcs[0] = Load2<1>(src, srcs[0]);
2059     src += src_stride;
2060     srcs[0] = Load2<2>(src, srcs[0]);
2061     src += src_stride;
2062 
2063     int y = 0;
2064     do {
2065       srcs[0] = Load2<3>(src, srcs[0]);
2066       src += src_stride;
2067       srcs[4] = Load2<0>(src, srcs[4]);
2068       src += src_stride;
2069       srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2070       srcs[4] = Load2<1>(src, srcs[4]);
2071       src += src_stride;
2072       srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2073       srcs[4] = Load2<2>(src, srcs[4]);
2074       src += src_stride;
2075       srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2076 
2077       // This uses srcs[0]..srcs[3].
2078       const int16x8_t sums =
2079           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2080       const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2081 
2082       Store2<0>(dst8, results);
2083       dst8 += dst_stride;
2084       Store2<1>(dst8, results);
2085       if (height == 2) return;
2086       dst8 += dst_stride;
2087       Store2<2>(dst8, results);
2088       dst8 += dst_stride;
2089       Store2<3>(dst8, results);
2090       dst8 += dst_stride;
2091 
2092       srcs[0] = srcs[4];
2093       y += 4;
2094     } while (y < height);
2095   } else if (num_taps == 6) {
2096     // During the vertical pass the number of taps is restricted when
2097     // |height| <= 4.
2098     assert(height > 4);
2099     srcs[8] = vdup_n_u8(0);
2100 
2101     srcs[0] = Load2(src);
2102     src += src_stride;
2103     srcs[0] = Load2<1>(src, srcs[0]);
2104     src += src_stride;
2105     srcs[0] = Load2<2>(src, srcs[0]);
2106     src += src_stride;
2107     srcs[0] = Load2<3>(src, srcs[0]);
2108     src += src_stride;
2109     srcs[4] = Load2(src);
2110     src += src_stride;
2111     srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2112 
2113     int y = 0;
2114     do {
2115       srcs[4] = Load2<1>(src, srcs[4]);
2116       src += src_stride;
2117       srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2118       srcs[4] = Load2<2>(src, srcs[4]);
2119       src += src_stride;
2120       srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2121       srcs[4] = Load2<3>(src, srcs[4]);
2122       src += src_stride;
2123       srcs[8] = Load2<0>(src, srcs[8]);
2124       src += src_stride;
2125       srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2126 
2127       // This uses srcs[0]..srcs[5].
2128       const int16x8_t sums =
2129           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2130       const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2131 
2132       Store2<0>(dst8, results);
2133       dst8 += dst_stride;
2134       Store2<1>(dst8, results);
2135       dst8 += dst_stride;
2136       Store2<2>(dst8, results);
2137       dst8 += dst_stride;
2138       Store2<3>(dst8, results);
2139       dst8 += dst_stride;
2140 
2141       srcs[0] = srcs[4];
2142       srcs[1] = srcs[5];
2143       srcs[4] = srcs[8];
2144       y += 4;
2145     } while (y < height);
2146   } else if (num_taps == 8) {
2147     // During the vertical pass the number of taps is restricted when
2148     // |height| <= 4.
2149     assert(height > 4);
2150     srcs[8] = vdup_n_u8(0);
2151 
2152     srcs[0] = Load2(src);
2153     src += src_stride;
2154     srcs[0] = Load2<1>(src, srcs[0]);
2155     src += src_stride;
2156     srcs[0] = Load2<2>(src, srcs[0]);
2157     src += src_stride;
2158     srcs[0] = Load2<3>(src, srcs[0]);
2159     src += src_stride;
2160     srcs[4] = Load2(src);
2161     src += src_stride;
2162     srcs[1] = vext_u8(srcs[0], srcs[4], 2);
2163     srcs[4] = Load2<1>(src, srcs[4]);
2164     src += src_stride;
2165     srcs[2] = vext_u8(srcs[0], srcs[4], 4);
2166     srcs[4] = Load2<2>(src, srcs[4]);
2167     src += src_stride;
2168     srcs[3] = vext_u8(srcs[0], srcs[4], 6);
2169 
2170     int y = 0;
2171     do {
2172       srcs[4] = Load2<3>(src, srcs[4]);
2173       src += src_stride;
2174       srcs[8] = Load2<0>(src, srcs[8]);
2175       src += src_stride;
2176       srcs[5] = vext_u8(srcs[4], srcs[8], 2);
2177       srcs[8] = Load2<1>(src, srcs[8]);
2178       src += src_stride;
2179       srcs[6] = vext_u8(srcs[4], srcs[8], 4);
2180       srcs[8] = Load2<2>(src, srcs[8]);
2181       src += src_stride;
2182       srcs[7] = vext_u8(srcs[4], srcs[8], 6);
2183 
2184       // This uses srcs[0]..srcs[7].
2185       const int16x8_t sums =
2186           SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
2187       const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
2188 
2189       Store2<0>(dst8, results);
2190       dst8 += dst_stride;
2191       Store2<1>(dst8, results);
2192       dst8 += dst_stride;
2193       Store2<2>(dst8, results);
2194       dst8 += dst_stride;
2195       Store2<3>(dst8, results);
2196       dst8 += dst_stride;
2197 
2198       srcs[0] = srcs[4];
2199       srcs[1] = srcs[5];
2200       srcs[2] = srcs[6];
2201       srcs[3] = srcs[7];
2202       srcs[4] = srcs[8];
2203       y += 4;
2204     } while (y < height);
2205   }
2206 }
2207 
2208 // This function is a simplified version of Convolve2D_C.
2209 // It is called in single prediction mode, where only vertical filtering is
2210 // required.
2211 // The output is the single prediction of the block, clipped to valid pixel
2212 // range.
2213 void ConvolveVertical_NEON(const void* const reference,
2214                            const ptrdiff_t reference_stride,
2215                            const int /*horizontal_filter_index*/,
2216                            const int vertical_filter_index,
2217                            const int /*horizontal_filter_id*/,
2218                            const int vertical_filter_id, const int width,
2219                            const int height, void* prediction,
2220                            const ptrdiff_t pred_stride) {
2221   const int filter_index = GetFilterIndex(vertical_filter_index, height);
2222   const int vertical_taps = GetNumTapsInFilter(filter_index);
2223   const ptrdiff_t src_stride = reference_stride;
2224   const auto* src = static_cast<const uint8_t*>(reference) -
2225                     (vertical_taps / 2 - 1) * src_stride;
2226   auto* dest = static_cast<uint8_t*>(prediction);
2227   const ptrdiff_t dest_stride = pred_stride;
2228   assert(vertical_filter_id != 0);
2229 
2230   uint8x8_t taps[8];
2231   for (int k = 0; k < kSubPixelTaps; ++k) {
2232     taps[k] =
2233         vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
2234   }
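  // |taps| holds all kSubPixelTaps columns of the selected filter. Shorter
  // filters are centered within the 8 entries, so the call sites below pass
  // taps + 1 for 6 tap filters, taps + 2 for 4 tap and taps + 3 for 2 tap.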
2235 
2236   if (filter_index == 0) {  // 6 tap.
2237     if (width == 2) {
2238       FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
2239                            taps + 1);
2240     } else if (width == 4) {
2241       FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
2242                            taps + 1);
2243     } else {
2244       FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
2245                         taps + 1);
2246     }
2247   } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
2248                                     (vertical_filter_id == 15))) {  // 5 tap.
2249     if (width == 2) {
2250       FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
2251                            taps + 1);
2252     } else if (width == 4) {
2253       FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
2254                            taps + 1);
2255     } else {
2256       FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
2257                         taps + 1);
2258     }
2259   } else if ((filter_index == 1) &
2260              ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
2261               (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
2262     if (width == 2) {
2263       FilterVertical2xH<1,
2264                         /*negative_outside_taps=*/true>(
2265           src, src_stride, dest, dest_stride, height, taps + 1);
2266     } else if (width == 4) {
2267       FilterVertical4xH<1, /*is_compound=*/false,
2268                         /*negative_outside_taps=*/true>(
2269           src, src_stride, dest, dest_stride, height, taps + 1);
2270     } else {
2271       FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
2272           src, src_stride, dest, dest_stride, width, height, taps + 1);
2273     }
2274   } else if (filter_index == 2) {  // 8 tap.
2275     if (width == 2) {
2276       FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
2277     } else if (width == 4) {
2278       FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
2279     } else {
2280       FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
2281                         taps);
2282     }
2283   } else if (filter_index == 3) {  // 2 tap.
2284     if (width == 2) {
2285       FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
2286                            taps + 3);
2287     } else if (width == 4) {
2288       FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
2289                            taps + 3);
2290     } else {
2291       FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
2292                         taps + 3);
2293     }
2294   } else if (filter_index == 4) {  // 4 tap.
2295     // Outside taps are negative.
2296     if (width == 2) {
2297       FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
2298                            taps + 2);
2299     } else if (width == 4) {
2300       FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
2301                            taps + 2);
2302     } else {
2303       FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
2304                         taps + 2);
2305     }
2306   } else {
2307     // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
2308     // below map to 4 tap filters.
2309     assert(filter_index == 5 ||
2310            (filter_index == 1 &&
2311             (vertical_filter_id == 2 || vertical_filter_id == 3 ||
2312              vertical_filter_id == 4 || vertical_filter_id == 5 ||
2313              vertical_filter_id == 6 || vertical_filter_id == 10 ||
2314              vertical_filter_id == 11 || vertical_filter_id == 12 ||
2315              vertical_filter_id == 13 || vertical_filter_id == 14)));
2316     // According to GetNumTapsInFilter() this has 6 taps but here we are
2317     // treating it as though it has 4.
2318     if (filter_index == 1) src += src_stride;
2319     if (width == 2) {
2320       FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
2321                            taps + 2);
2322     } else if (width == 4) {
2323       FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
2324                            taps + 2);
2325     } else {
2326       FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
2327                         taps + 2);
2328     }
2329   }
2330 }
2331 
2332 void ConvolveCompoundCopy_NEON(
2333     const void* const reference, const ptrdiff_t reference_stride,
2334     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
2335     const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
2336     const int width, const int height, void* prediction,
2337     const ptrdiff_t /*pred_stride*/) {
2338   const auto* src = static_cast<const uint8_t*>(reference);
2339   const ptrdiff_t src_stride = reference_stride;
2340   auto* dest = static_cast<uint16_t*>(prediction);
2341   constexpr int final_shift =
2342       kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
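  // For 8bpp, kInterRoundBitsVertical == 11 and
  // kInterRoundBitsCompoundVertical == 7, so the compound copy simply scales
  // each source pixel by 1 << 4 to match the compound prediction range.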
2343 
2344   if (width >= 16) {
2345     int y = 0;
2346     do {
2347       int x = 0;
2348       do {
2349         const uint8x16_t v_src = vld1q_u8(&src[x]);
2350         const uint16x8_t v_dest_lo =
2351             vshll_n_u8(vget_low_u8(v_src), final_shift);
2352         const uint16x8_t v_dest_hi =
2353             vshll_n_u8(vget_high_u8(v_src), final_shift);
2354         vst1q_u16(&dest[x], v_dest_lo);
2355         x += 8;
2356         vst1q_u16(&dest[x], v_dest_hi);
2357         x += 8;
2358       } while (x < width);
2359       src += src_stride;
2360       dest += width;
2361     } while (++y < height);
2362   } else if (width == 8) {
2363     int y = 0;
2364     do {
2365       const uint8x8_t v_src = vld1_u8(&src[0]);
2366       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2367       vst1q_u16(&dest[0], v_dest);
2368       src += src_stride;
2369       dest += width;
2370     } while (++y < height);
2371   } else { /* width == 4 */
2372     uint8x8_t v_src = vdup_n_u8(0);
2373 
2374     int y = 0;
2375     do {
2376       v_src = Load4<0>(&src[0], v_src);
2377       src += src_stride;
2378       v_src = Load4<1>(&src[0], v_src);
2379       src += src_stride;
2380       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
2381       vst1q_u16(&dest[0], v_dest);
2382       dest += 4 << 1;
2383       y += 2;
2384     } while (y < height);
2385   }
2386 }
2387 
2388 void ConvolveCompoundVertical_NEON(
2389     const void* const reference, const ptrdiff_t reference_stride,
2390     const int /*horizontal_filter_index*/, const int vertical_filter_index,
2391     const int /*horizontal_filter_id*/, const int vertical_filter_id,
2392     const int width, const int height, void* prediction,
2393     const ptrdiff_t /*pred_stride*/) {
2394   const int filter_index = GetFilterIndex(vertical_filter_index, height);
2395   const int vertical_taps = GetNumTapsInFilter(filter_index);
2396   const ptrdiff_t src_stride = reference_stride;
2397   const auto* src = static_cast<const uint8_t*>(reference) -
2398                     (vertical_taps / 2 - 1) * src_stride;
2399   auto* dest = static_cast<uint16_t*>(prediction);
2400   assert(vertical_filter_id != 0);
2401 
2402   uint8x8_t taps[8];
2403   for (int k = 0; k < kSubPixelTaps; ++k) {
2404     taps[k] =
2405         vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
2406   }
2407 
2408   if (filter_index == 0) {  // 6 tap.
2409     if (width == 4) {
2410       FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
2411                                                  height, taps + 1);
2412     } else {
2413       FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
2414                                               width, height, taps + 1);
2415     }
2416   } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
2417                                     (vertical_filter_id == 15))) {  // 5 tap.
2418     if (width == 4) {
2419       FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
2420                                                  height, taps + 1);
2421     } else {
2422       FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
2423                                               width, height, taps + 1);
2424     }
2425   } else if ((filter_index == 1) &
2426              ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
2427               (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
2428     if (width == 4) {
2429       FilterVertical4xH<1, /*is_compound=*/true,
2430                         /*negative_outside_taps=*/true>(src, src_stride, dest,
2431                                                         4, height, taps + 1);
2432     } else {
2433       FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
2434           src, src_stride, dest, width, width, height, taps + 1);
2435     }
2436   } else if (filter_index == 2) {  // 8 tap.
2437     if (width == 4) {
2438       FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
2439                                                  height, taps);
2440     } else {
2441       FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
2442                                               width, height, taps);
2443     }
2444   } else if (filter_index == 3) {  // 2 tap.
2445     if (width == 4) {
2446       FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
2447                                                  height, taps + 3);
2448     } else {
2449       FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
2450                                               width, height, taps + 3);
2451     }
2452   } else if (filter_index == 4) {  // 4 tap.
2453     if (width == 4) {
2454       FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
2455                                                  height, taps + 2);
2456     } else {
2457       FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
2458                                               width, height, taps + 2);
2459     }
2460   } else {
2461     // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
2462     // below map to 4 tap filters.
2463     assert(filter_index == 5 ||
2464            (filter_index == 1 &&
2465             (vertical_filter_id == 2 || vertical_filter_id == 3 ||
2466              vertical_filter_id == 4 || vertical_filter_id == 5 ||
2467              vertical_filter_id == 6 || vertical_filter_id == 10 ||
2468              vertical_filter_id == 11 || vertical_filter_id == 12 ||
2469              vertical_filter_id == 13 || vertical_filter_id == 14)));
2470     // According to GetNumTapsInFilter() this has 6 taps but here we are
2471     // treating it as though it has 4.
2472     if (filter_index == 1) src += src_stride;
2473     if (width == 4) {
2474       FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
2475                                                  height, taps + 2);
2476     } else {
2477       FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
2478                                               width, height, taps + 2);
2479     }
2480   }
2481 }
2482 
2483 void ConvolveCompoundHorizontal_NEON(
2484     const void* const reference, const ptrdiff_t reference_stride,
2485     const int horizontal_filter_index, const int /*vertical_filter_index*/,
2486     const int horizontal_filter_id, const int /*vertical_filter_id*/,
2487     const int width, const int height, void* prediction,
2488     const ptrdiff_t /*pred_stride*/) {
2489   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
2490   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
2491   auto* dest = static_cast<uint16_t*>(prediction);
2492 
2493   DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
2494       src, reference_stride, dest, width, width, height, horizontal_filter_id,
2495       filter_index);
2496 }
2497 
2498 void ConvolveCompound2D_NEON(const void* const reference,
2499                              const ptrdiff_t reference_stride,
2500                              const int horizontal_filter_index,
2501                              const int vertical_filter_index,
2502                              const int horizontal_filter_id,
2503                              const int vertical_filter_id, const int width,
2504                              const int height, void* prediction,
2505                              const ptrdiff_t /*pred_stride*/) {
2506   // The output of the horizontal filter, i.e. the intermediate_result, is
2507   // guaranteed to fit in int16_t.
2508   uint16_t
2509       intermediate_result[kMaxSuperBlockSizeInPixels *
2510                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
2511 
  // Horizontal filter.
  // Filter types used for width <= 4 are different from those for width > 4.
  // When width > 4, the valid filter index range is [0, 3]. When width <= 4,
  // the 6 and 8 tap filters are replaced by 4 tap filters (indices 4 and 5);
  // the 2 tap bilinear filter keeps index 3. Similarly for height.
  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
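  // The horizontal pass must produce |vertical_taps| - 1 extra rows for the
  // vertical pass, e.g. an 8 tap vertical filter over a height 32 block reads
  // 32 + 8 - 1 = 39 intermediate rows.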
  const int intermediate_height = height + vertical_taps - 1;
  const ptrdiff_t src_stride = reference_stride;
  const auto* const src = static_cast<const uint8_t*>(reference) -
                          (vertical_taps / 2 - 1) * src_stride -
                          kHorizontalOffset;

  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
      src, src_stride, intermediate_result, width, width, intermediate_height,
      horizontal_filter_id, horiz_filter_index);

  // Vertical filter.
  auto* dest = static_cast<uint16_t*>(prediction);
  assert(vertical_filter_id != 0);

  const ptrdiff_t dest_stride = width;
  const int16x8_t taps = vmovl_s8(
      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));

  if (vertical_taps == 8) {
    if (width == 4) {
      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<8, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else if (vertical_taps == 6) {
    if (width == 4) {
      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<6, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else if (vertical_taps == 4) {
    if (width == 4) {
      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<4, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  } else {  // |vertical_taps| == 2
    if (width == 4) {
      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
                                                   dest_stride, height, taps);
    } else {
      Filter2DVertical<2, /*is_compound=*/true>(
          intermediate_result, dest, dest_stride, width, height, taps);
    }
  }
}

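// Average two horizontally adjacent 16 byte vectors with rounding:
// vrhaddq_u8 computes (left + right + 1) >> 1, which is the half-pel
// horizontal case of intra block copy.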
inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
  const uint8x16_t left = vld1q_u8(src);
  const uint8x16_t right = vld1q_u8(src + 1);
  vst1q_u8(dst, vrhaddq_u8(left, right));
}

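// Process each row of |width| pixels in unrolled 16 pixel chunks. The chunks
// advance |src| and |dst| by width - 16 bytes, so adding
// stride - (width - 16) lands both pointers on the next row.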
template <int width>
inline void IntraBlockCopyHorizontal(const uint8_t* src,
                                     const ptrdiff_t src_stride,
                                     const int height, uint8_t* dst,
                                     const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);

  int y = 0;
  do {
    HalfAddHorizontal(src, dst);
    if (width >= 32) {
      src += 16;
      dst += 16;
      HalfAddHorizontal(src, dst);
      if (width >= 64) {
        src += 16;
        dst += 16;
        HalfAddHorizontal(src, dst);
        src += 16;
        dst += 16;
        HalfAddHorizontal(src, dst);
        if (width == 128) {
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
          src += 16;
          dst += 16;
          HalfAddHorizontal(src, dst);
        }
      }
    }
    src += src_remainder_stride;
    dst += dst_remainder_stride;
  } while (++y < height);
}

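// Intra block copy with a half-pel horizontal offset: every output pixel is
// the rounded average of two horizontally adjacent source pixels.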
void ConvolveIntraBlockCopyHorizontal_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* const prediction,
    const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
                                  pred_stride);
  } else if (width == 64) {
    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 32) {
    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 16) {
    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 8) {
    int y = 0;
    do {
      const uint8x8_t left = vld1_u8(src);
      const uint8x8_t right = vld1_u8(src + 1);
      vst1_u8(dest, vrhadd_u8(left, right));

      src += reference_stride;
      dest += pred_stride;
    } while (++y < height);
  } else if (width == 4) {
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  } else {
    assert(width == 2);
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<1>(src, left);
      right = Load2<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<1>(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  }
}

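// Vertical half-pel averaging: the previous source row is kept in |row[]| so
// each row is loaded once and averaged with the row below it.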
template <int width>
inline void IntraBlockCopyVertical(const uint8_t* src,
                                   const ptrdiff_t src_stride, const int height,
                                   uint8_t* dst, const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
  uint8x16_t row[8], below[8];

  row[0] = vld1q_u8(src);
  if (width >= 32) {
    src += 16;
    row[1] = vld1q_u8(src);
    if (width >= 64) {
      src += 16;
      row[2] = vld1q_u8(src);
      src += 16;
      row[3] = vld1q_u8(src);
      if (width == 128) {
        src += 16;
        row[4] = vld1q_u8(src);
        src += 16;
        row[5] = vld1q_u8(src);
        src += 16;
        row[6] = vld1q_u8(src);
        src += 16;
        row[7] = vld1q_u8(src);
      }
    }
  }
  src += src_remainder_stride;

  int y = 0;
  do {
    below[0] = vld1q_u8(src);
    if (width >= 32) {
      src += 16;
      below[1] = vld1q_u8(src);
      if (width >= 64) {
        src += 16;
        below[2] = vld1q_u8(src);
        src += 16;
        below[3] = vld1q_u8(src);
        if (width == 128) {
          src += 16;
          below[4] = vld1q_u8(src);
          src += 16;
          below[5] = vld1q_u8(src);
          src += 16;
          below[6] = vld1q_u8(src);
          src += 16;
          below[7] = vld1q_u8(src);
        }
      }
    }
    src += src_remainder_stride;

    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
    row[0] = below[0];
    if (width >= 32) {
      dst += 16;
      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
      row[1] = below[1];
      if (width >= 64) {
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
        row[2] = below[2];
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
        row[3] = below[3];
        if (width == 128) {
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
          row[4] = below[4];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
          row[5] = below[5];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
          row[6] = below[6];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
          row[7] = below[7];
        }
      }
    }
    dst += dst_remainder_stride;
  } while (++y < height);
}

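// Intra block copy with a half-pel vertical offset: every output pixel is the
// rounded average of the source pixel and the one directly below it.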
void ConvolveIntraBlockCopyVertical_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* const prediction,
    const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
                                pred_stride);
  } else if (width == 64) {
    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 32) {
    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 16) {
    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 8) {
    uint8x8_t row = vld1_u8(src);
    src += reference_stride;

    int y = 0;
    do {
      const uint8x8_t below = vld1_u8(src);
      src += reference_stride;

      vst1_u8(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else if (width == 4) {
    uint8x8_t row = Load4(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load4<0>(src, below);
      src += reference_stride;

      StoreLo4(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else {
    assert(width == 2);
    uint8x8_t row = Load2(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load2<0>(src, below);
      src += reference_stride;

      Store2<0>(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  }
}

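// 2D intra block copy: out = (tl + tr + bl + br + 2) >> 2. The horizontal
// pair sums are widened to 16 bits with vaddl_u8 (no rounding), the vertical
// add happens at 16 bits, and vrshrn_n_u16 applies the single rounding shift.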
template <int width>
inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
                             const int height, uint8_t* dst,
                             const ptrdiff_t dst_stride) {
  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
  uint16x8_t row[16];
  row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
  if (width >= 16) {
    src += 8;
    row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
    if (width >= 32) {
      src += 8;
      row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      src += 8;
      row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      if (width >= 64) {
        src += 8;
        row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        src += 8;
        row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        if (width == 128) {
          src += 8;
          row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          src += 8;
          row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        }
      }
    }
  }
  src += src_remainder_stride;

  int y = 0;
  do {
    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
    row[0] = below_0;
    if (width >= 16) {
      src += 8;
      dst += 8;

      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
      row[1] = below_1;
      if (width >= 32) {
        src += 8;
        dst += 8;

        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
        row[2] = below_2;
        src += 8;
        dst += 8;

        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
        row[3] = below_3;
        if (width >= 64) {
          src += 8;
          dst += 8;

          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
          row[4] = below_4;
          src += 8;
          dst += 8;

          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
          row[5] = below_5;
          src += 8;
          dst += 8;

          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
          row[6] = below_6;
          src += 8;
          dst += 8;

          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
          row[7] = below_7;
          if (width == 128) {
            src += 8;
            dst += 8;

            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
            row[8] = below_8;
            src += 8;
            dst += 8;

            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
            row[9] = below_9;
            src += 8;
            dst += 8;

            const uint16x8_t below_10 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
            row[10] = below_10;
            src += 8;
            dst += 8;

            const uint16x8_t below_11 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
            row[11] = below_11;
            src += 8;
            dst += 8;

            const uint16x8_t below_12 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
            row[12] = below_12;
            src += 8;
            dst += 8;

            const uint16x8_t below_13 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
            row[13] = below_13;
            src += 8;
            dst += 8;

            const uint16x8_t below_14 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
            row[14] = below_14;
            src += 8;
            dst += 8;

            const uint16x8_t below_15 =
                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
            row[15] = below_15;
          }
        }
      }
    }
    src += src_remainder_stride;
    dst += dst_remainder_stride;
  } while (++y < height);
}

void ConvolveIntraBlockCopy2D_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* const prediction,
    const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);
  // Note: this function may read one row beyond |height|. Because it is only
  // used for the u/v planes of intra block copy, such access is guaranteed to
  // be within the prediction block.

  if (width == 128) {
    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 64) {
    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 32) {
    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 16) {
    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 8) {
    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 4) {
    uint8x8_t left = Load4(src);
    uint8x8_t right = Load4(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

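      // |row| holds the pair sums from the row above; combining it with the
      // low half of |below| lets a single vaddq_u16 plus rounding narrow
      // produce two output rows at once.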
      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  } else {
    assert(width == 2);
    uint8x8_t left = Load2(src);
    uint8x8_t right = Load2(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      // The second row goes into lane 2 (bytes 4 and 5) so its widened sums
      // land in the high half of |below|, matching the 4 pixel wide path.
      left = Load2<2>(src, left);
      right = Load2<2>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<2>(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  }
}

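// The convolve table is indexed by
// [is_intra_block_copy][is_compound][has_vertical_filter]
// [has_horizontal_filter]; entries not set here keep their C implementations.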
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
  dsp->convolve[0][0][1][1] = Convolve2D_NEON;

  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;

  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;

  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
}

}  // namespace
}  // namespace low_bitdepth

void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_NEON

namespace libgav1 {
namespace dsp {

void ConvolveInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON