1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/convolve.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26
27 #include "src/dsp/arm/common_neon.h"
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/utils/common.h"
31 #include "src/utils/compiler_attributes.h"
32
33 namespace libgav1 {
34 namespace dsp {
35 namespace low_bitdepth {
36 namespace {
37
38 constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
39 constexpr int kHorizontalOffset = 3;
40 constexpr int kFilterIndexShift = 6;
41
42 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
43 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
44 // sum from outranging int16_t.
template <int filter_index, bool negative_outside_taps = false>
int16x8_t SumOnePassTaps(const uint8x8_t* const src,
                         const uint8x8_t* const taps) {
  // |taps| holds absolute values; negative taps are applied with vmlsl_u8.
  // The accumulation is done in uint16 lanes: intermediate unsigned wraparound
  // is benign because the final value is reinterpreted as int16_t (the filters
  // are pre-shifted by 1 so the true sum always fits in int16_t).
  uint16x8_t sum;
  if (filter_index == 0) {
    // 6 taps. + - + + - +
    sum = vmull_u8(src[0], taps[0]);
    // Unsigned overflow will result in a valid int16_t value.
    sum = vmlsl_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlsl_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1 && negative_outside_taps) {
    // 6 taps. - + + + + -
    // Set a base we can subtract from.
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
  } else if (filter_index == 1) {
    // 6 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlal_u8(sum, src[5], taps[5]);
  } else if (filter_index == 2) {
    // 8 taps. - + - + + - + -
    // Start from a positive tap so there is a base to subtract from.
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlsl_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
    sum = vmlal_u8(sum, src[4], taps[4]);
    sum = vmlsl_u8(sum, src[5], taps[5]);
    sum = vmlal_u8(sum, src[6], taps[6]);
    sum = vmlsl_u8(sum, src[7], taps[7]);
  } else if (filter_index == 3) {
    // 2 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
  } else if (filter_index == 4) {
    // 4 taps. - + + -
    sum = vmull_u8(src[1], taps[1]);
    sum = vmlsl_u8(sum, src[0], taps[0]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlsl_u8(sum, src[3], taps[3]);
  } else if (filter_index == 5) {
    // 4 taps. All are positive.
    sum = vmull_u8(src[0], taps[0]);
    sum = vmlal_u8(sum, src[1], taps[1]);
    sum = vmlal_u8(sum, src[2], taps[2]);
    sum = vmlal_u8(sum, src[3], taps[3]);
  }
  return vreinterpretq_s16_u16(sum);
}
104
// Computes one row of horizontal filtering for 8 output pixels. Loads 16
// source pixels once and builds each tap's input window with vextq_u8 instead
// of issuing a load per tap.
template <int filter_index, bool negative_outside_taps>
int16x8_t SumHorizontalTaps(const uint8_t* const src,
                            const uint8x8_t* const v_tap) {
  uint8x8_t v_src[8];
  const uint8x16_t src_long = vld1q_u8(src);
  int16x8_t sum;

  if (filter_index < 2) {
    // 6 tap filters: the outer taps of the 8 tap layout are zero, so the
    // windows start one pixel in and |v_tap + 1| is used.
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
  } else if (filter_index == 2) {
    // Full 8 tap filter uses all 8 windows starting at offset 0.
    v_src[0] = vget_low_u8(src_long);
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
  } else if (filter_index == 3) {
    // 2 tap (bilinear): windows at offsets 3 and 4, taps 3 and 4.
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
  } else if (filter_index > 3) {
    // 4 tap filters: windows at offsets 2..5, taps 2..5.
    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
  }
  return sum;
}
143
// Horizontal-only filter for 8 pixels: filters, folds the two conceptual
// rounding shifts into one, and narrows back to 8 bit with saturation.
template <int filter_index, bool negative_outside_taps>
uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
  constexpr int skipped_shift_rounding = 1 << (kInterRoundBitsHorizontal - 2);

  const int16x8_t filtered =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
  const int16x8_t adjusted =
      vaddq_s16(filtered, vdupq_n_s16(skipped_shift_rounding));
  return vqrshrun_n_s16(adjusted, kFilterBits - 1);
}
159
// Horizontal pass for the 2d and compound paths: keep the 16 bit intermediate
// and apply only the first-stage rounding shift; later stages finish the
// downshift.
template <int filter_index, bool negative_outside_taps>
uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
                               const uint8x8_t* const v_tap) {
  const int16x8_t filtered =
      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
  const int16x8_t rounded =
      vrshrq_n_s16(filtered, kInterRoundBitsHorizontal - 1);
  return vreinterpretq_u16_s16(rounded);
}
169
// Filters a 2x2 output block in one shot. Two source rows are interleaved
// with vzip_u8 so each vector holds pixels from both rows; the tap windows
// are then formed with vext_u8/RightShift over the zipped pair. Only used for
// the 2 and 4 tap filters (|filter_index| 3, 4, 5).
template <int filter_index>
int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
                               const uint8x8_t* const v_tap) {
  uint16x8_t sum;
  const uint8x8_t input0 = vld1_u8(src);
  src += src_stride;
  const uint8x8_t input1 = vld1_u8(src);
  // input.val[0]: rows 0/1 interleaved, pixels 0..3.
  // input.val[1]: rows 0/1 interleaved, pixels 4..7.
  uint8x8x2_t input = vzip_u8(input0, input1);

  if (filter_index == 3) {
    // tap signs : + +
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
  } else if (filter_index == 4) {
    // tap signs : - + + -
    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  } else {
    // tap signs : + + + +
    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
  }

  // |sum| fits in int16_t; see SumOnePassTaps for why the unsigned
  // accumulation is safe.
  return vreinterpretq_s16_u16(sum);
}
199
// 2x2 horizontal-only filter: filter both rows, fold the two conceptual
// rounding shifts into one, and narrow to 8 bit with saturation.
template <int filter_index>
uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
                                  const ptrdiff_t src_stride,
                                  const uint8x8_t* const v_tap) {
  // Normally the Horizontal pass does the downshift in two passes:
  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
  // requires adding the rounding offset from the skipped shift.
  constexpr int skipped_shift_rounding = 1 << (kInterRoundBitsHorizontal - 2);

  const int16x8_t filtered =
      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
  const int16x8_t adjusted =
      vaddq_s16(filtered, vdupq_n_s16(skipped_shift_rounding));
  return vqrshrun_n_s16(adjusted, kFilterBits - 1);
}
215
// 2x2 horizontal pass for the 2d path: keep the 16 bit intermediate and
// apply only the first-stage rounding shift.
template <int filter_index>
uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
                                   const ptrdiff_t src_stride,
                                   const uint8x8_t* const v_tap) {
  const int16x8_t filtered =
      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
  const int16x8_t rounded =
      vrshrq_n_s16(filtered, kInterRoundBitsHorizontal - 1);
  return vreinterpretq_u16_s16(rounded);
}
226
// Applies the horizontal filter over a |width| x |height| block.
// - is_2d/is_compound: write 16 bit intermediates to |dest| (stride
//   |pred_stride| in uint16_t units); otherwise write final 8 bit pixels.
// - Wide blocks (width > 4) are processed 8 pixels at a time; width <= 4 uses
//   the 2x2 kernels and only ever sees 2 or 4 tap filters.
template <int num_taps, int step, int filter_index,
          bool negative_outside_taps = true, bool is_2d = false,
          bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
                      void* const dest, const ptrdiff_t pred_stride,
                      const int width, const int height,
                      const uint8x8_t* const v_tap) {
  auto* dest8 = static_cast<uint8_t*>(dest);
  auto* dest16 = static_cast<uint16_t*>(dest);

  // 4 tap filters are never used when width > 4.
  if (num_taps != 4 && width > 4) {
    int y = 0;
    do {
      int x = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
                                                                       v_tap);
          vst1q_u16(&dest16[x], v_sum);
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
                                                                        v_tap);
          vst1_u8(&dest8[x], result);
        }
        x += step;
      } while (x < width);
      src += src_stride;
      // Only one of dest8/dest16 is meaningful; advancing both is harmless.
      dest8 += pred_stride;
      dest16 += pred_stride;
    } while (++y < height);
    return;
  }

  // Horizontal passes only needs to account for |num_taps| 2 and 4 when
  // |width| <= 4.
  assert(width <= 4);
  assert(num_taps <= 4);
  if (num_taps <= 4) {
    if (width == 4) {
      int y = 0;
      do {
        if (is_2d || is_compound) {
          const uint16x8_t v_sum =
              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
                                                                       v_tap);
          // Only 4 of the 8 computed results are needed.
          vst1_u16(dest16, vget_low_u16(v_sum));
        } else {
          const uint8x8_t result =
              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
                                                                        v_tap);
          StoreLo4(&dest8[0], result);
        }
        src += src_stride;
        dest8 += pred_stride;
        dest16 += pred_stride;
      } while (++y < height);
      return;
    }

    // width == 2. The compound path never reaches this size.
    if (!is_compound) {
      int y = 0;
      do {
        if (is_2d) {
          // The 2x2 kernel interleaves the two rows; de-interleave on store.
          const uint16x8_t sum =
              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
          dest16[0] = vgetq_lane_u16(sum, 0);
          dest16[1] = vgetq_lane_u16(sum, 2);
          dest16 += pred_stride;
          dest16[0] = vgetq_lane_u16(sum, 1);
          dest16[1] = vgetq_lane_u16(sum, 3);
          dest16 += pred_stride;
        } else {
          const uint8x8_t sum =
              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);

          dest8[0] = vget_lane_u8(sum, 0);
          dest8[1] = vget_lane_u8(sum, 2);
          dest8 += pred_stride;

          dest8[0] = vget_lane_u8(sum, 1);
          dest8[1] = vget_lane_u8(sum, 3);
          dest8 += pred_stride;
        }

        src += src_stride << 1;
        y += 2;
      } while (y < height - 1);

      // The 2d filters have an odd |height| because the horizontal pass
      // generates context for the vertical pass.
      if (is_2d) {
        assert(height % 2 == 1);
        uint16x8_t sum;
        const uint8x8_t input = vld1_u8(src);
        if (filter_index == 3) {  // |num_taps| == 2
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
        } else if (filter_index == 4) {
          // tap signs : - + + -
          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        } else {
          assert(filter_index == 5);
          // tap signs : + + + +
          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
        }
        // |sum| contains an int16_t value.
        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
        Store2<0>(dest16, sum);
      }
    }
  }
}
347
348 // Process 16 bit inputs and output 32 bits.
// Vertical stage of the 2d filter for 4 pixel wide columns: multiplies 16 bit
// intermediates by the taps, accumulates in 32 bits, then rounds back down to
// 16 bits. Shorter filters use the centered lanes of |taps| because the outer
// taps are zero.
template <int num_taps, bool is_compound>
inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
                                    const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum;
  if (num_taps == 8) {
    sum = vmull_lane_s16(src[0], taps_lo, 0);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
  } else if (num_taps == 6) {
    sum = vmull_lane_s16(src[0], taps_lo, 1);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
  } else if (num_taps == 4) {
    sum = vmull_lane_s16(src[0], taps_lo, 2);
    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
  } else if (num_taps == 2) {
    sum = vmull_lane_s16(src[0], taps_lo, 3);
    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
  }

  // Compound output keeps more precision than the final-pixel path.
  if (is_compound) {
    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
  }

  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
}
387
// 8 pixel wide version of Sum2DVerticalTaps4: the 16 bit intermediates are
// split into low/high halves so the 32 bit accumulation fits in int32x4_t
// pairs, then both halves are rounded and recombined.
template <int num_taps, bool is_compound>
int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
                                  const int16x8_t taps) {
  const int16x4_t taps_lo = vget_low_s16(taps);
  const int16x4_t taps_hi = vget_high_s16(taps);
  int32x4_t sum_lo, sum_hi;
  if (num_taps == 8) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
  } else if (num_taps == 6) {
    // 6 tap filters use the centered taps (lanes 1..6); outer taps are zero.
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
  } else if (num_taps == 4) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
  } else if (num_taps == 2) {
    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);

    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
  }

  // Round down to 16 bits; the compound path keeps more precision.
  if (is_compound) {
    return vcombine_s16(
        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
  }

  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
}
453
// Vertical pass of the 2d filter for |width| >= 8. Processes one 8 pixel
// column strip at a time, keeping the |num_taps| most recent intermediate
// rows in registers and rotating them down each output row.
template <int num_taps, bool is_compound = false>
void Filter2DVertical(const uint16_t* src, void* const dst,
                      const ptrdiff_t dst_stride, const int width,
                      const int height, const int16x8_t taps) {
  assert(width >= 8);
  // Index of the register loaded fresh each row; the others slide down.
  constexpr int next_row = num_taps - 1;
  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
  const ptrdiff_t src_stride = width;

  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  int x = 0;
  do {
    int16x8_t srcs[8];
    const uint16_t* src_x = src + x;
    // Prime the first |num_taps| - 1 rows of the sliding window.
    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
          src_x += src_stride;
        }
      }
    }

    int y = 0;
    do {
      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
      src_x += src_stride;

      const int16x8_t sum =
          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
      if (is_compound) {
        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
      } else {
        // Final-pixel path: narrow to 8 bit with unsigned saturation.
        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
      }

      // Slide the window down one row.
      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (++y < height);
    x += 8;
  } while (x < width);
}
521
522 // Take advantage of |src_stride| == |width| to process two rows at a time.
// Vertical pass of the 2d filter for |width| == 4.
// Take advantage of |src_stride| == |width| to process two rows at a time:
// each 128 bit load covers two consecutive 4 pixel rows, and the odd-index
// registers are synthesized by combining the halves of their neighbors.
template <int num_taps, bool is_compound = false>
void Filter2DVertical4xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  // Even indices come straight from loads (rows 2k/2k+1); odd indices are
  // built from the high half of one load and the low half of the next.
  int16x8_t srcs[9];
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 4) {
    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
    if (num_taps >= 6) {
      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
      src += 8;
      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
      if (num_taps == 8) {
        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
        src += 8;
        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
      }
    }
  }

  int y = 0;
  do {
    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
                                      vget_low_s16(srcs[num_taps]));

    // |sum| holds two output rows of 4 pixels each.
    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
    if (is_compound) {
      const uint16x8_t results = vreinterpretq_u16_s16(sum);
      vst1q_u16(dst16, results);
      // Compound output is contiguous: advance by two 4 pixel rows.
      dst16 += 4 << 1;
    } else {
      const uint8x8_t results = vqmovun_s16(sum);

      StoreLo4(dst8, results);
      dst8 += dst_stride;
      StoreHi4(dst8, results);
      dst8 += dst_stride;
    }

    // Slide the window down two rows.
    srcs[0] = srcs[2];
    if (num_taps >= 4) {
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      if (num_taps >= 6) {
        srcs[3] = srcs[5];
        srcs[4] = srcs[6];
        if (num_taps == 8) {
          srcs[5] = srcs[7];
          srcs[6] = srcs[8];
        }
      }
    }
    y += 2;
  } while (y < height);
}
587
588 // Take advantage of |src_stride| == |width| to process four rows at a time.
// Vertical pass of the 2d filter for |width| == 2.
// Take advantage of |src_stride| == |width| to process four rows at a time:
// each 128 bit load covers four consecutive 2 pixel rows, and intermediate
// registers are synthesized with vextq_s16/vcombine_s16. This size is never
// compound, so only the 8 bit output path exists.
template <int num_taps>
void Filter2DVertical2xH(const uint16_t* src, void* const dst,
                         const ptrdiff_t dst_stride, const int height,
                         const int16x8_t taps) {
  // Short filters consume one fresh load (4 rows) per iteration; long
  // filters keep two loads alive, so the fresh load lands at index 8.
  constexpr int next_row = (num_taps < 6) ? 4 : 8;

  auto* dst8 = static_cast<uint8_t*>(dst);

  int16x8_t srcs[9];
  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
  src += 8;
  if (num_taps >= 6) {
    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    // srcs[k] holds rows k..k+3; shift by 2k pixels within the loaded pair.
    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    if (num_taps == 8) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    }
  }

  int y = 0;
  do {
    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
    src += 8;
    // Rebuild the row-shifted views that depend on the fresh load.
    if (num_taps == 2) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
    } else if (num_taps == 4) {
      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
    } else if (num_taps == 6) {
      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
    } else if (num_taps == 8) {
      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
    }

    // |sum| holds four output rows of 2 pixels each.
    const int16x8_t sum =
        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
    const uint8x8_t results = vqmovun_s16(sum);

    Store2<0>(dst8, results);
    dst8 += dst_stride;
    Store2<1>(dst8, results);
    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
    // Therefore we don't need to check this condition when |height| > 4.
    if (num_taps <= 4 && height == 2) return;
    dst8 += dst_stride;
    Store2<2>(dst8, results);
    dst8 += dst_stride;
    Store2<3>(dst8, results);
    dst8 += dst_stride;

    // Slide the window down four rows.
    srcs[0] = srcs[4];
    if (num_taps == 6) {
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
    } else if (num_taps == 8) {
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
    }

    y += 4;
  } while (y < height);
}
660
661 template <bool is_2d = false, bool is_compound = false>
DoHorizontalPass(const uint8_t * const src,const ptrdiff_t src_stride,void * const dst,const ptrdiff_t dst_stride,const int width,const int height,const int filter_id,const int filter_index)662 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
663 const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
664 const ptrdiff_t dst_stride, const int width, const int height,
665 const int filter_id, const int filter_index) {
666 // Duplicate the absolute value for each tap. Negative taps are corrected
667 // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
668 uint8x8_t v_tap[kSubPixelTaps];
669 assert(filter_id != 0);
670
671 for (int k = 0; k < kSubPixelTaps; ++k) {
672 v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
673 }
674
675 if (filter_index == 2) { // 8 tap.
676 FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
677 src, src_stride, dst, dst_stride, width, height, v_tap);
678 } else if (filter_index == 1) { // 6 tap.
679 // Check if outside taps are positive.
680 if ((filter_id == 1) | (filter_id == 15)) {
681 FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
682 src, src_stride, dst, dst_stride, width, height, v_tap);
683 } else {
684 FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
685 src, src_stride, dst, dst_stride, width, height, v_tap);
686 }
687 } else if (filter_index == 0) { // 6 tap.
688 FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
689 src, src_stride, dst, dst_stride, width, height, v_tap);
690 } else if (filter_index == 4) { // 4 tap.
691 FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
692 src, src_stride, dst, dst_stride, width, height, v_tap);
693 } else if (filter_index == 5) { // 4 tap.
694 FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
695 src, src_stride, dst, dst_stride, width, height, v_tap);
696 } else { // 2 tap.
697 FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
698 src, src_stride, dst, dst_stride, width, height, v_tap);
699 }
700 }
701
GetNumTapsInFilter(const int filter_index)702 int GetNumTapsInFilter(const int filter_index) {
703 if (filter_index < 2) {
704 // Despite the names these only use 6 taps.
705 // kInterpolationFilterEightTap
706 // kInterpolationFilterEightTapSmooth
707 return 6;
708 }
709
710 if (filter_index == 2) {
711 // kInterpolationFilterEightTapSharp
712 return 8;
713 }
714
715 if (filter_index == 3) {
716 // kInterpolationFilterBilinear
717 return 2;
718 }
719
720 assert(filter_index > 3);
721 // For small sizes (width/height <= 4) the large filters are replaced with 4
722 // tap options.
723 // If the original filters were |kInterpolationFilterEightTap| or
724 // |kInterpolationFilterEightTapSharp| then it becomes
725 // |kInterpolationFilterSwitchable|.
726 // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
727 // tap filter.
728 return 4;
729 }
730
Convolve2D_NEON(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)731 void Convolve2D_NEON(const void* const reference,
732 const ptrdiff_t reference_stride,
733 const int horizontal_filter_index,
734 const int vertical_filter_index,
735 const int horizontal_filter_id,
736 const int vertical_filter_id, const int width,
737 const int height, void* prediction,
738 const ptrdiff_t pred_stride) {
739 const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
740 const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
741 const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
742
743 // The output of the horizontal filter is guaranteed to fit in 16 bits.
744 uint16_t
745 intermediate_result[kMaxSuperBlockSizeInPixels *
746 (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
747 const int intermediate_height = height + vertical_taps - 1;
748
749 const ptrdiff_t src_stride = reference_stride;
750 const auto* src = static_cast<const uint8_t*>(reference) -
751 (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
752
753 DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
754 width, intermediate_height,
755 horizontal_filter_id, horiz_filter_index);
756
757 // Vertical filter.
758 auto* dest = static_cast<uint8_t*>(prediction);
759 const ptrdiff_t dest_stride = pred_stride;
760 assert(vertical_filter_id != 0);
761
762 const int16x8_t taps = vmovl_s8(
763 vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
764
765 if (vertical_taps == 8) {
766 if (width == 2) {
767 Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
768 taps);
769 } else if (width == 4) {
770 Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
771 taps);
772 } else {
773 Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
774 taps);
775 }
776 } else if (vertical_taps == 6) {
777 if (width == 2) {
778 Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
779 taps);
780 } else if (width == 4) {
781 Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
782 taps);
783 } else {
784 Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
785 taps);
786 }
787 } else if (vertical_taps == 4) {
788 if (width == 2) {
789 Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
790 taps);
791 } else if (width == 4) {
792 Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
793 taps);
794 } else {
795 Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
796 taps);
797 }
798 } else { // |vertical_taps| == 2
799 if (width == 2) {
800 Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
801 taps);
802 } else if (width == 4) {
803 Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
804 taps);
805 } else {
806 Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
807 taps);
808 }
809 }
810 }
811
812 // There are many opportunities for overreading in scaled convolve, because the
813 // range of starting points for filter windows is anywhere from 0 to 16 for 8
814 // destination pixels, and the window sizes range from 2 to 8. To accommodate
815 // this range concisely, we use |grade_x| to mean the most steps in src that can
816 // be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
817 // we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
818 // increments. The first load covers the initial elements of src_x, while the
819 // final load covers the taps.
820 template <int grade_x>
LoadSrcVals(const uint8_t * src_x)821 inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
822 uint8x8x3_t ret;
823 const uint8x16_t src_val = vld1q_u8(src_x);
824 ret.val[0] = vget_low_u8(src_val);
825 ret.val[1] = vget_high_u8(src_val);
826 if (grade_x > 1) {
827 ret.val[2] = vld1_u8(src_x + 16);
828 }
829 return ret;
830 }
831
832 // Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
GetPositive2TapFilter(const int tap_index)833 inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
834 assert(tap_index < 2);
835 alignas(
836 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
837 {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
838 {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
839
840 return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
841 }
842
843 template <int grade_x>
ConvolveKernelHorizontal2Tap(const uint8_t * src,const ptrdiff_t src_stride,const int width,const int subpixel_x,const int step_x,const int intermediate_height,int16_t * intermediate)844 inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
845 const ptrdiff_t src_stride,
846 const int width, const int subpixel_x,
847 const int step_x,
848 const int intermediate_height,
849 int16_t* intermediate) {
850 // Account for the 0-taps that precede the 2 nonzero taps.
851 const int kernel_offset = 3;
852 const int ref_x = subpixel_x >> kScaleSubPixelBits;
853 const int step_x8 = step_x << 3;
854 const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
855 const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
856 const uint16x8_t index_steps = vmulq_n_u16(
857 vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
858 const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
859
860 int p = subpixel_x;
861 if (width <= 4) {
862 const uint8_t* src_x =
863 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
864 // Only add steps to the 10-bit truncated p to avoid overflow.
865 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
866 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
867 const uint8x8_t filter_indices =
868 vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
869 // This is a special case. The 2-tap filter has no negative taps, so we
870 // can use unsigned values.
871 // For each x, a lane of tapsK has
872 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
873 // on x.
874 const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
875 VQTbl1U8(filter_taps1, filter_indices)};
876 int y = 0;
877 do {
878 // Load a pool of samples to select from using stepped indices.
879 const uint8x16_t src_vals = vld1q_u8(src_x);
880 const uint8x8_t src_indices =
881 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
882
883 // For each x, a lane of srcK contains src_x[k].
884 const uint8x8_t src[2] = {
885 VQTbl1U8(src_vals, src_indices),
886 VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
887
888 vst1q_s16(intermediate,
889 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
890 kInterRoundBitsHorizontal - 1));
891 src_x += src_stride;
892 intermediate += kIntermediateStride;
893 } while (++y < intermediate_height);
894 return;
895 }
896
897 // |width| >= 8
898 int x = 0;
899 do {
900 const uint8_t* src_x =
901 &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
902 int16_t* intermediate_x = intermediate + x;
903 // Only add steps to the 10-bit truncated p to avoid overflow.
904 const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
905 const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
906 const uint8x8_t filter_indices =
907 vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
908 filter_index_mask);
909 // This is a special case. The 2-tap filter has no negative taps, so we
910 // can use unsigned values.
911 // For each x, a lane of tapsK has
912 // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
913 // on x.
914 const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
915 VQTbl1U8(filter_taps1, filter_indices)};
916 int y = 0;
917 do {
918 // Load a pool of samples to select from using stepped indices.
919 const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
920 const uint8x8_t src_indices =
921 vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
922
923 // For each x, a lane of srcK contains src_x[k].
924 const uint8x8_t src[2] = {
925 vtbl3_u8(src_vals, src_indices),
926 vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
927
928 vst1q_s16(intermediate_x,
929 vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
930 kInterRoundBitsHorizontal - 1));
931 src_x += src_stride;
932 intermediate_x += kIntermediateStride;
933 } while (++y < intermediate_height);
934 x += 8;
935 p += step_x8;
936 } while (x < width);
937 }
938
939 // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
GetPositive4TapFilter(const int tap_index)940 inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
941 assert(tap_index < 4);
942 alignas(
943 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
944 {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
945 {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
946 {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
947 {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
948
949 return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
950 }
951
// This filter is only possible when width <= 4.
// Scaled horizontal filtering with the all-positive 4 tap kernel
// (filter index 5). Produces |intermediate_height| rows of rounded int16_t
// results in |intermediate|, one vector store per row.
void ConvolveKernelHorizontalPositive4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  // The first nonzero tap is 2 columns into the 8-column kernel window.
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
  // Per-lane subpel positions: lane i gets i * step_x.
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  const int p = subpixel_x;
  // First filter is special, just a 128 tap on the center.
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
  const uint8x8_t filter_indices = vand_u8(
      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

  // Indices are loop-invariant: only the source pointer advances per row.
  const uint8x8_t src_indices =
      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
  int y = 0;
  do {
    // Load a pool of samples to select from using stepped index vectors.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, srcK contains src_x[k] where k=1.
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
                           kInterRoundBitsHorizontal - 1));

    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}
1005
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
// Row k holds tap k for all 16 filter ids, so one table lookup per tap
// gathers a vector of per-lane coefficients. Values are magnitudes;
// sign handling is presumably applied in SumOnePassTaps<4> — confirm
// against kAbsHalfSubPixelFilters[4].
inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
  assert(tap_index < 4);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};

  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
}
1018
// This filter is only possible when width <= 4.
// Scaled horizontal filtering with the signed 4 tap kernel (filter index 4).
// Uses 4-lane (uint16x4) subpel math since at most 4 outputs are produced.
inline void ConvolveKernelHorizontalSigned4Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
    const int step_x, const int intermediate_height, int16_t* intermediate) {
  // The first nonzero tap is 2 columns into the 8-column kernel window.
  const int kernel_offset = 2;
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
  // Per-lane subpel positions for 4 outputs: lane i gets i * step_x.
  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
                                            static_cast<uint16_t>(step_x));

  const int p = subpixel_x;
  const uint8_t* src_x =
      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
  // Only add steps to the 10-bit truncated p to avoid overflow.
  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
  // Narrowing shift requires a full q register; the high half is padding.
  const uint8x8_t filter_index_offsets = vshrn_n_u16(
      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
  const uint8x8_t filter_indices =
      vand_u8(filter_index_offsets, filter_index_mask);
  // Note that filter_id depends on x.
  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
                             VQTbl1U8(filter_taps1, filter_indices),
                             VQTbl1U8(filter_taps2, filter_indices),
                             VQTbl1U8(filter_taps3, filter_indices)};

  // |filter_index_offsets| is already >> kFilterIndexShift, so shifting by
  // the remaining (kScaleSubPixelBits - kFilterIndexShift) bits yields the
  // whole-pixel source indices.
  const uint8x8_t src_indices_base =
      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);

  const uint8x8_t src_indices[4] = {src_indices_base,
                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
                                    vadd_u8(src_indices_base, vdup_n_u8(3))};

  int y = 0;
  do {
    // Load a pool of samples to select from using stepped indices.
    const uint8x16_t src_vals = vld1q_u8(src_x);

    // For each x, srcK contains src_x[k] where k=1.
    // Whereas taps come from different arrays, src pixels are drawn from the
    // same contiguous line.
    const uint8x8_t src[4] = {
        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};

    vst1q_s16(intermediate,
              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
                           kInterRoundBitsHorizontal - 1));
    src_x += src_stride;
    intermediate += kIntermediateStride;
  } while (++y < intermediate_height);
}
1077
// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
// Row k holds tap k for all 16 filter ids. Values are magnitudes; the
// + - + + - + sign pattern is applied in SumOnePassTaps<0> (see the
// filter_index == 0 branch there).
inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
  assert(tap_index < 6);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};

  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
}
1092
// This filter is only possible when width >= 8.
// Scaled horizontal filtering with the signed 6 tap kernel (filter index 0),
// 8 outputs per iteration of the outer x loop.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  // The first nonzero tap is 1 column into the 8-column kernel window.
  const int kernel_offset = 1;
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x16_t filter_taps[6];
  for (int i = 0; i < 6; ++i) {
    filter_taps[i] = GetSigned6TapFilter(i);
  }
  // Per-lane subpel positions: lane i gets i * step_x.
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));

  int x = 0;
  int p = subpixel_x;
  do {
    // Avoid overreading outside the reference boundaries. (NOTE(review):
    // original comment said "overloading" and referenced a |trailing_width|
    // of up to 24, which does not appear in this function — likely stale.)
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    // src_lookup[i] indexes source pixel i positions after the window start.
    uint8x8_t src_lookup[6];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 6; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    uint8x8_t taps[6];
    for (int i = 0; i < 6; ++i) {
      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
    }
    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      const uint8x8_t src[6] = {
          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}
1160
1161 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
1162 // has mixed positive and negative outer taps which are handled in
1163 // GetMixed6TapFilter().
GetPositive6TapFilter(const int tap_index)1164 inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
1165 assert(tap_index < 6);
1166 alignas(16) static constexpr uint8_t
1167 kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
1168 {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
1169 {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
1170 {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
1171 {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
1172
1173 return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
1174 }
1175
GetMixed6TapFilter(const int tap_index)1176 inline int8x16_t GetMixed6TapFilter(const int tap_index) {
1177 assert(tap_index < 2);
1178 alignas(
1179 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
1180 {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
1181 {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
1182
1183 return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
1184 }
1185
// This filter is only possible when width >= 8.
// Scaled horizontal filtering with the 6 tap filter whose outer taps are
// mixed-sign (filter index 1). The 4 inner (non-negative) taps use unsigned
// multiply-accumulate; the 2 signed outer taps are widened to int16 and
// accumulated separately.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  // The first nonzero tap is 1 column into the 8-column kernel window.
  const int kernel_offset = 1;
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x8_t taps[4];
  int16x8_t mixed_taps[2];
  uint8x16_t positive_filter_taps[4];
  for (int i = 0; i < 4; ++i) {
    positive_filter_taps[i] = GetPositive6TapFilter(i);
  }
  int8x16_t mixed_filter_taps[2];
  mixed_filter_taps[0] = GetMixed6TapFilter(0);
  mixed_filter_taps[1] = GetMixed6TapFilter(1);
  // Per-lane subpel positions: lane i gets i * step_x.
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));

  int x = 0;
  int p = subpixel_x;
  do {
    const uint8_t* src_x =
        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    // src_lookup[i] indexes source pixel i positions after the window start.
    uint8x8_t src_lookup[6];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 6; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    for (int i = 0; i < 4; ++i) {
      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
    }
    // Outer taps are signed; widen to int16 for signed multiplies below.
    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));

    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      // Signed contribution of the two outer taps (src_lookup[0] and [5]).
      int16x8_t sum_mixed = vmulq_s16(
          mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
      sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
                            ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
      // Reinterpret to unsigned so the inner taps can use vmlal_u8; as with
      // SumOnePassTaps, two's-complement wraparound keeps the final
      // reinterpreted int16 sum valid.
      uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
      sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
      sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
      sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
      sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));

      vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
                                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}
1262
// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
// Row k holds tap k for all 16 filter ids. Values are magnitudes; sign
// handling is presumably applied in SumOnePassTaps<2> — confirm against
// kAbsHalfSubPixelFilters[2].
inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
  assert(tap_index < 8);
  alignas(16) static constexpr uint8_t
      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};

  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
}
1279
// This filter is only possible when width >= 8.
// Scaled horizontal filtering with the signed 8 tap kernel (filter index 2).
// All 8 kernel columns are nonzero, so no kernel offset is applied to the
// source pointer (unlike the 2/4/6 tap kernels).
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
    const uint8_t* src, const ptrdiff_t src_stride, const int width,
    const int subpixel_x, const int step_x, const int intermediate_height,
    int16_t* intermediate) {
  const uint8x8_t one = vdup_n_u8(1);
  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
  const int ref_x = subpixel_x >> kScaleSubPixelBits;
  const int step_x8 = step_x << 3;
  uint8x8_t taps[8];
  uint8x16_t filter_taps[8];
  for (int i = 0; i < 8; ++i) {
    filter_taps[i] = GetSigned8TapFilter(i);
  }
  // Per-lane subpel positions: lane i gets i * step_x.
  const uint16x8_t index_steps = vmulq_n_u16(
      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
  int x = 0;
  int p = subpixel_x;
  do {
    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
    int16_t* intermediate_x = intermediate + x;
    // Only add steps to the 10-bit truncated p to avoid overflow.
    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
    const uint8x8_t src_indices =
        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
    // src_lookup[i] indexes source pixel i positions after the window start.
    uint8x8_t src_lookup[8];
    src_lookup[0] = src_indices;
    for (int i = 1; i < 8; ++i) {
      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
    }

    const uint8x8_t filter_indices =
        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
                filter_index_mask);
    // For each x, a lane of taps[k] has
    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
    // on x.
    for (int i = 0; i < 8; ++i) {
      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
    }

    int y = 0;
    do {
      // Load a pool of samples to select from using stepped indices.
      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);

      const uint8x8_t src[8] = {
          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};

      vst1q_s16(intermediate_x,
                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
                             kInterRoundBitsHorizontal - 1));
      src_x += src_stride;
      intermediate_x += kIntermediateStride;
    } while (++y < intermediate_height);
    x += 8;
    p += step_x8;
  } while (x < width);
}
1344
1345 // This function handles blocks of width 2 or 4.
1346 template <int num_taps, int grade_y, int width, bool is_compound>
ConvolveVerticalScale4xH(const int16_t * src,const int subpixel_y,const int filter_index,const int step_y,const int height,void * dest,const ptrdiff_t dest_stride)1347 void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
1348 const int filter_index, const int step_y,
1349 const int height, void* dest,
1350 const ptrdiff_t dest_stride) {
1351 constexpr ptrdiff_t src_stride = kIntermediateStride;
1352 const int16_t* src_y = src;
1353 // |dest| is 16-bit in compound mode, Pixel otherwise.
1354 uint16_t* dest16_y = static_cast<uint16_t*>(dest);
1355 uint8_t* dest_y = static_cast<uint8_t*>(dest);
1356 int16x4_t s[num_taps + grade_y];
1357
1358 int p = subpixel_y & 1023;
1359 int prev_p = p;
1360 int y = 0;
1361 do { // y < height
1362 for (int i = 0; i < num_taps; ++i) {
1363 s[i] = vld1_s16(src_y + i * src_stride);
1364 }
1365 int filter_id = (p >> 6) & kSubPixelMask;
1366 int16x8_t filter =
1367 vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1368 int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
1369 if (is_compound) {
1370 assert(width != 2);
1371 const uint16x4_t result = vreinterpret_u16_s16(sums);
1372 vst1_u16(dest16_y, result);
1373 } else {
1374 const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1375 if (width == 2) {
1376 Store2<0>(dest_y, result);
1377 } else {
1378 StoreLo4(dest_y, result);
1379 }
1380 }
1381 p += step_y;
1382 const int p_diff =
1383 (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1384 prev_p = p;
1385 // Here we load extra source in case it is needed. If |p_diff| == 0, these
1386 // values will be unused, but it's faster to load than to branch.
1387 s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
1388 if (grade_y > 1) {
1389 s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
1390 }
1391 dest16_y += dest_stride;
1392 dest_y += dest_stride;
1393
1394 filter_id = (p >> 6) & kSubPixelMask;
1395 filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1396 sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
1397 if (is_compound) {
1398 assert(width != 2);
1399 const uint16x4_t result = vreinterpret_u16_s16(sums);
1400 vst1_u16(dest16_y, result);
1401 } else {
1402 const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
1403 if (width == 2) {
1404 Store2<0>(dest_y, result);
1405 } else {
1406 StoreLo4(dest_y, result);
1407 }
1408 }
1409 p += step_y;
1410 src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1411 prev_p = p;
1412 dest16_y += dest_stride;
1413 dest_y += dest_stride;
1414
1415 y += 2;
1416 } while (y < height);
1417 }
1418
1419 template <int num_taps, int grade_y, bool is_compound>
ConvolveVerticalScale(const int16_t * src,const int width,const int subpixel_y,const int filter_index,const int step_y,const int height,void * dest,const ptrdiff_t dest_stride)1420 inline void ConvolveVerticalScale(const int16_t* src, const int width,
1421 const int subpixel_y, const int filter_index,
1422 const int step_y, const int height,
1423 void* dest, const ptrdiff_t dest_stride) {
1424 constexpr ptrdiff_t src_stride = kIntermediateStride;
1425 // A possible improvement is to use arithmetic to decide how many times to
1426 // apply filters to same source before checking whether to load new srcs.
1427 // However, this will only improve performance with very small step sizes.
1428 int16x8_t s[num_taps + grade_y];
1429 // |dest| is 16-bit in compound mode, Pixel otherwise.
1430 uint16_t* dest16_y;
1431 uint8_t* dest_y;
1432
1433 int x = 0;
1434 do { // x < width
1435 const int16_t* src_x = src + x;
1436 const int16_t* src_y = src_x;
1437 dest16_y = static_cast<uint16_t*>(dest) + x;
1438 dest_y = static_cast<uint8_t*>(dest) + x;
1439 int p = subpixel_y & 1023;
1440 int prev_p = p;
1441 int y = 0;
1442 do { // y < height
1443 for (int i = 0; i < num_taps; ++i) {
1444 s[i] = vld1q_s16(src_y + i * src_stride);
1445 }
1446 int filter_id = (p >> 6) & kSubPixelMask;
1447 int16x8_t filter =
1448 vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1449 int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
1450 if (is_compound) {
1451 vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1452 } else {
1453 vst1_u8(dest_y, vqmovun_s16(sum));
1454 }
1455 p += step_y;
1456 const int p_diff =
1457 (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
1458 // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
1459 // needed. Otherwise, we only need to load one vector because |p_diff|
1460 // can't exceed 1.
1461 s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
1462 if (grade_y > 1) {
1463 s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
1464 }
1465 dest16_y += dest_stride;
1466 dest_y += dest_stride;
1467
1468 filter_id = (p >> 6) & kSubPixelMask;
1469 filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
1470 sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
1471 if (is_compound) {
1472 vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
1473 } else {
1474 vst1_u8(dest_y, vqmovun_s16(sum));
1475 }
1476 p += step_y;
1477 src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
1478 prev_p = p;
1479 dest16_y += dest_stride;
1480 dest_y += dest_stride;
1481
1482 y += 2;
1483 } while (y < height);
1484 x += 8;
1485 } while (x < width);
1486 }
1487
1488 template <bool is_compound>
ConvolveScale2D_NEON(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int subpixel_x,const int subpixel_y,const int step_x,const int step_y,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)1489 void ConvolveScale2D_NEON(const void* const reference,
1490 const ptrdiff_t reference_stride,
1491 const int horizontal_filter_index,
1492 const int vertical_filter_index, const int subpixel_x,
1493 const int subpixel_y, const int step_x,
1494 const int step_y, const int width, const int height,
1495 void* prediction, const ptrdiff_t pred_stride) {
1496 const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1497 const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1498 assert(step_x <= 2048);
1499 const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
1500 const int intermediate_height =
1501 (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
1502 kScaleSubPixelBits) +
1503 num_vert_taps;
1504 assert(step_x <= 2048);
1505 // The output of the horizontal filter, i.e. the intermediate_result, is
1506 // guaranteed to fit in int16_t.
1507 int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
1508 (2 * kMaxSuperBlockSizeInPixels + 8)];
1509
1510 // Horizontal filter.
1511 // Filter types used for width <= 4 are different from those for width > 4.
1512 // When width > 4, the valid filter index range is always [0, 3].
1513 // When width <= 4, the valid filter index range is always [3, 5].
1514 // Similarly for height.
1515 int filter_index = GetFilterIndex(horizontal_filter_index, width);
1516 int16_t* intermediate = intermediate_result;
1517 const ptrdiff_t src_stride = reference_stride;
1518 const auto* src = static_cast<const uint8_t*>(reference);
1519 const int vert_kernel_offset = (8 - num_vert_taps) / 2;
1520 src += vert_kernel_offset * src_stride;
1521
1522 // Derive the maximum value of |step_x| at which all source values fit in one
1523 // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
1524 // step_x*7 is the final base subpel index for the shuffle mask for filter
1525 // inputs in each iteration on large blocks. When step_x is large, we need a
1526 // larger structure and use a larger table lookup in order to gather all
1527 // filter inputs.
1528 // |num_taps| - 1 is the shuffle index of the final filter input.
1529 const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
1530 const int kernel_start_ceiling = 16 - num_horiz_taps;
1531 // This truncated quotient |grade_x_threshold| selects |step_x| such that:
1532 // (step_x * 7) >> kScaleSubPixelBits < single load limit
1533 const int grade_x_threshold =
1534 (kernel_start_ceiling << kScaleSubPixelBits) / 7;
1535 switch (filter_index) {
1536 case 0:
1537 if (step_x > grade_x_threshold) {
1538 ConvolveKernelHorizontalSigned6Tap<2>(
1539 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1540 intermediate);
1541 } else {
1542 ConvolveKernelHorizontalSigned6Tap<1>(
1543 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1544 intermediate);
1545 }
1546 break;
1547 case 1:
1548 if (step_x > grade_x_threshold) {
1549 ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
1550 step_x, intermediate_height,
1551 intermediate);
1552
1553 } else {
1554 ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
1555 step_x, intermediate_height,
1556 intermediate);
1557 }
1558 break;
1559 case 2:
1560 if (step_x > grade_x_threshold) {
1561 ConvolveKernelHorizontalSigned8Tap<2>(
1562 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1563 intermediate);
1564 } else {
1565 ConvolveKernelHorizontalSigned8Tap<1>(
1566 src, src_stride, width, subpixel_x, step_x, intermediate_height,
1567 intermediate);
1568 }
1569 break;
1570 case 3:
1571 if (step_x > grade_x_threshold) {
1572 ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
1573 step_x, intermediate_height,
1574 intermediate);
1575 } else {
1576 ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
1577 step_x, intermediate_height,
1578 intermediate);
1579 }
1580 break;
1581 case 4:
1582 assert(width <= 4);
1583 ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
1584 intermediate_height, intermediate);
1585 break;
1586 default:
1587 assert(filter_index == 5);
1588 ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
1589 intermediate_height, intermediate);
1590 }
1591 // Vertical filter.
1592 filter_index = GetFilterIndex(vertical_filter_index, height);
1593 intermediate = intermediate_result;
1594
1595 switch (filter_index) {
1596 case 0:
1597 case 1:
1598 if (step_y <= 1024) {
1599 if (!is_compound && width == 2) {
1600 ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
1601 intermediate, subpixel_y, filter_index, step_y, height,
1602 prediction, pred_stride);
1603 } else if (width == 4) {
1604 ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
1605 intermediate, subpixel_y, filter_index, step_y, height,
1606 prediction, pred_stride);
1607 } else {
1608 ConvolveVerticalScale<6, 1, is_compound>(
1609 intermediate, width, subpixel_y, filter_index, step_y, height,
1610 prediction, pred_stride);
1611 }
1612 } else {
1613 if (!is_compound && width == 2) {
1614 ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
1615 intermediate, subpixel_y, filter_index, step_y, height,
1616 prediction, pred_stride);
1617 } else if (width == 4) {
1618 ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
1619 intermediate, subpixel_y, filter_index, step_y, height,
1620 prediction, pred_stride);
1621 } else {
1622 ConvolveVerticalScale<6, 2, is_compound>(
1623 intermediate, width, subpixel_y, filter_index, step_y, height,
1624 prediction, pred_stride);
1625 }
1626 }
1627 break;
1628 case 2:
1629 if (step_y <= 1024) {
1630 if (!is_compound && width == 2) {
1631 ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
1632 intermediate, subpixel_y, filter_index, step_y, height,
1633 prediction, pred_stride);
1634 } else if (width == 4) {
1635 ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
1636 intermediate, subpixel_y, filter_index, step_y, height,
1637 prediction, pred_stride);
1638 } else {
1639 ConvolveVerticalScale<8, 1, is_compound>(
1640 intermediate, width, subpixel_y, filter_index, step_y, height,
1641 prediction, pred_stride);
1642 }
1643 } else {
1644 if (!is_compound && width == 2) {
1645 ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
1646 intermediate, subpixel_y, filter_index, step_y, height,
1647 prediction, pred_stride);
1648 } else if (width == 4) {
1649 ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
1650 intermediate, subpixel_y, filter_index, step_y, height,
1651 prediction, pred_stride);
1652 } else {
1653 ConvolveVerticalScale<8, 2, is_compound>(
1654 intermediate, width, subpixel_y, filter_index, step_y, height,
1655 prediction, pred_stride);
1656 }
1657 }
1658 break;
1659 case 3:
1660 if (step_y <= 1024) {
1661 if (!is_compound && width == 2) {
1662 ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
1663 intermediate, subpixel_y, filter_index, step_y, height,
1664 prediction, pred_stride);
1665 } else if (width == 4) {
1666 ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
1667 intermediate, subpixel_y, filter_index, step_y, height,
1668 prediction, pred_stride);
1669 } else {
1670 ConvolveVerticalScale<2, 1, is_compound>(
1671 intermediate, width, subpixel_y, filter_index, step_y, height,
1672 prediction, pred_stride);
1673 }
1674 } else {
1675 if (!is_compound && width == 2) {
1676 ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
1677 intermediate, subpixel_y, filter_index, step_y, height,
1678 prediction, pred_stride);
1679 } else if (width == 4) {
1680 ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
1681 intermediate, subpixel_y, filter_index, step_y, height,
1682 prediction, pred_stride);
1683 } else {
1684 ConvolveVerticalScale<2, 2, is_compound>(
1685 intermediate, width, subpixel_y, filter_index, step_y, height,
1686 prediction, pred_stride);
1687 }
1688 }
1689 break;
1690 case 4:
1691 default:
1692 assert(filter_index == 4 || filter_index == 5);
1693 assert(height <= 4);
1694 if (step_y <= 1024) {
1695 if (!is_compound && width == 2) {
1696 ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
1697 intermediate, subpixel_y, filter_index, step_y, height,
1698 prediction, pred_stride);
1699 } else if (width == 4) {
1700 ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
1701 intermediate, subpixel_y, filter_index, step_y, height,
1702 prediction, pred_stride);
1703 } else {
1704 ConvolveVerticalScale<4, 1, is_compound>(
1705 intermediate, width, subpixel_y, filter_index, step_y, height,
1706 prediction, pred_stride);
1707 }
1708 } else {
1709 if (!is_compound && width == 2) {
1710 ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
1711 intermediate, subpixel_y, filter_index, step_y, height,
1712 prediction, pred_stride);
1713 } else if (width == 4) {
1714 ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
1715 intermediate, subpixel_y, filter_index, step_y, height,
1716 prediction, pred_stride);
1717 } else {
1718 ConvolveVerticalScale<4, 2, is_compound>(
1719 intermediate, width, subpixel_y, filter_index, step_y, height,
1720 prediction, pred_stride);
1721 }
1722 }
1723 }
1724 }
1725
ConvolveHorizontal_NEON(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * prediction,const ptrdiff_t pred_stride)1726 void ConvolveHorizontal_NEON(const void* const reference,
1727 const ptrdiff_t reference_stride,
1728 const int horizontal_filter_index,
1729 const int /*vertical_filter_index*/,
1730 const int horizontal_filter_id,
1731 const int /*vertical_filter_id*/, const int width,
1732 const int height, void* prediction,
1733 const ptrdiff_t pred_stride) {
1734 const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1735 // Set |src| to the outermost tap.
1736 const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1737 auto* dest = static_cast<uint8_t*>(prediction);
1738
1739 DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
1740 horizontal_filter_id, filter_index);
1741 }
1742
// Rounding shift applied to all 1D compound outputs. The shift amount is
// always |kInterRoundBitsHorizontal|, even for 1D vertical calculations; the
// "- 1" compensates for the filter taps having been pre-shifted by 1 (see
// SumOnePassTaps).
uint16x8_t Compound1DShift(const int16x8_t sum) {
  const int16x8_t shifted = vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1);
  return vreinterpretq_u16_s16(shifted);
}
1749
// Vertical filtering for blocks with width >= 8. The block is processed in
// 8-pixel-wide column strips. Within each strip the most recent |num_taps|
// source rows are held in a sliding window of registers (|srcs|); each output
// row loads exactly one new source row and then shifts the window down.
// When |is_compound| is true the 16-bit intermediate (after Compound1DShift)
// is stored; otherwise the result is rounded and clipped to 8 bits.
template <int filter_index, bool is_compound = false,
          bool negative_outside_taps = false>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
                    void* const dst, const ptrdiff_t dst_stride,
                    const int width, const int height,
                    const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  // Window slot filled by the newly loaded row on every iteration.
  const int next_row = num_taps - 1;
  // |dst| aliases uint16_t (compound) or uint8_t (single prediction); only
  // the alias matching |is_compound| is used, and |dst_stride| is in units of
  // the corresponding element type.
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);
  assert(width >= 8);

  int x = 0;
  do {
    const uint8_t* src_x = src + x;
    uint8x8_t srcs[8];
    // Prime the window with the first |num_taps| - 1 rows. These branches are
    // compile-time constant since |num_taps| derives from the template
    // parameter |filter_index|.
    srcs[0] = vld1_u8(src_x);
    src_x += src_stride;
    if (num_taps >= 4) {
      srcs[1] = vld1_u8(src_x);
      src_x += src_stride;
      srcs[2] = vld1_u8(src_x);
      src_x += src_stride;
      if (num_taps >= 6) {
        srcs[3] = vld1_u8(src_x);
        src_x += src_stride;
        srcs[4] = vld1_u8(src_x);
        src_x += src_stride;
        if (num_taps == 8) {
          srcs[5] = vld1_u8(src_x);
          src_x += src_stride;
          srcs[6] = vld1_u8(src_x);
          src_x += src_stride;
        }
      }
    }

    int y = 0;
    do {
      // Complete the window with the newest row, then run the filter.
      srcs[next_row] = vld1_u8(src_x);
      src_x += src_stride;

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        // Keep 16-bit precision for the compound predictor.
        const uint16x8_t results = Compound1DShift(sums);
        vst1q_u16(dst16 + x + y * dst_stride, results);
      } else {
        // Round, shift, and saturate to the 8-bit pixel range.
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
        vst1_u8(dst8 + x + y * dst_stride, results);
      }

      // Slide the window down by one row.
      srcs[0] = srcs[1];
      if (num_taps >= 4) {
        srcs[1] = srcs[2];
        srcs[2] = srcs[3];
        if (num_taps >= 6) {
          srcs[3] = srcs[4];
          srcs[4] = srcs[5];
          if (num_taps == 8) {
            srcs[5] = srcs[6];
            srcs[6] = srcs[7];
          }
        }
      }
    } while (++y < height);
    x += 8;
  } while (x < width);
}
1819
// Vertical filtering for 4-pixel-wide blocks. Two source rows are packed into
// each 8-lane register (low half = row i, high half = row i + 1) so every
// filter pass produces two output rows at once. Even-indexed |srcs| entries
// hold loaded row pairs; odd-indexed entries are synthesized with vext_u8 to
// form the pair that starts one row later. Processes two rows per iteration,
// so it relies on |height| being even.
template <int filter_index, bool is_compound = false,
          bool negative_outside_taps = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  // |dst| aliases uint16_t (compound, packed at 4 values/row) or uint8_t
  // (single prediction, advanced by |dst_stride|).
  auto* dst8 = static_cast<uint8_t*>(dst);
  auto* dst16 = static_cast<uint16_t*>(dst);

  uint8x8_t srcs[9];

  if (num_taps == 2) {
    // Zero the staging register so partial Load4<0> lane inserts are defined.
    srcs[2] = vdup_n_u8(0);

    srcs[0] = Load4(src);
    src += src_stride;

    int y = 0;
    do {
      // Load two new rows; srcs[1] is the pair offset by one row.
      srcs[0] = Load4<1>(src, srcs[0]);
      src += src_stride;
      srcs[2] = Load4<0>(src, srcs[2]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[2], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        // Two rows of 4 uint16_t values.
        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        // Low half is the first row, high half the second.
        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      y += 2;
    } while (y < height);
  } else if (num_taps == 4) {
    srcs[4] = vdup_n_u8(0);

    // Prime the first 3 rows (num_taps - 1).
    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);

    int y = 0;
    do {
      srcs[2] = Load4<1>(src, srcs[2]);
      src += src_stride;
      srcs[4] = Load4<0>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[2], srcs[4], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      // Slide the window down two rows.
      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      y += 2;
    } while (y < height);
  } else if (num_taps == 6) {
    srcs[6] = vdup_n_u8(0);

    // Prime the first 5 rows (num_taps - 1).
    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
    srcs[2] = Load4<1>(src, srcs[2]);
    src += src_stride;
    srcs[4] = Load4(src);
    src += src_stride;
    srcs[3] = vext_u8(srcs[2], srcs[4], 4);

    int y = 0;
    do {
      srcs[4] = Load4<1>(src, srcs[4]);
      src += src_stride;
      srcs[6] = Load4<0>(src, srcs[6]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[6], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      y += 2;
    } while (y < height);
  } else if (num_taps == 8) {
    srcs[8] = vdup_n_u8(0);

    // Prime the first 7 rows (num_taps - 1).
    srcs[0] = Load4(src);
    src += src_stride;
    srcs[0] = Load4<1>(src, srcs[0]);
    src += src_stride;
    srcs[2] = Load4(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
    srcs[2] = Load4<1>(src, srcs[2]);
    src += src_stride;
    srcs[4] = Load4(src);
    src += src_stride;
    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
    srcs[4] = Load4<1>(src, srcs[4]);
    src += src_stride;
    srcs[6] = Load4(src);
    src += src_stride;
    srcs[5] = vext_u8(srcs[4], srcs[6], 4);

    int y = 0;
    do {
      srcs[6] = Load4<1>(src, srcs[6]);
      src += src_stride;
      srcs[8] = Load4<0>(src, srcs[8]);
      src += src_stride;
      srcs[7] = vext_u8(srcs[6], srcs[8], 4);

      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      if (is_compound) {
        const uint16x8_t results = Compound1DShift(sums);

        vst1q_u16(dst16, results);
        dst16 += 4 << 1;
      } else {
        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

        StoreLo4(dst8, results);
        dst8 += dst_stride;
        StoreHi4(dst8, results);
        dst8 += dst_stride;
      }

      srcs[0] = srcs[2];
      srcs[1] = srcs[3];
      srcs[2] = srcs[4];
      srcs[3] = srcs[5];
      srcs[4] = srcs[6];
      srcs[5] = srcs[7];
      srcs[6] = srcs[8];
      y += 2;
    } while (y < height);
  }
}
2007
// Vertical filtering for 2-pixel-wide blocks (single prediction only; there
// is no compound path for width 2). Four source rows of 2 bytes each are
// packed into one 8-lane register via Load2<lane>; vext_u8 by multiples of 2
// synthesizes the windows starting 1..3 rows later. Each pass therefore
// produces four output rows (except the height == 2 early-out below).
template <int filter_index, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dst, const ptrdiff_t dst_stride,
                       const int height, const uint8x8_t* const taps) {
  const int num_taps = GetNumTapsInFilter(filter_index);
  auto* dst8 = static_cast<uint8_t*>(dst);

  uint8x8_t srcs[9];

  if (num_taps == 2) {
    // Zero the staging register so partial Load2<0> lane inserts are defined.
    srcs[2] = vdup_n_u8(0);

    srcs[0] = Load2(src);
    src += src_stride;

    int y = 0;
    do {
      // Fill srcs[0] with rows 0..3 and srcs[2] lane 0 with row 4; srcs[1] is
      // rows 1..4, i.e. the window shifted down one row.
      srcs[0] = Load2<1>(src, srcs[0]);
      src += src_stride;
      srcs[0] = Load2<2>(src, srcs[0]);
      src += src_stride;
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      srcs[2] = Load2<0>(src, srcs[2]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[2], 2);

      // This uses srcs[0]..srcs[1].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      // Each Store2<n> writes one 2-pixel row from 16-bit lane n.
      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      // 2x2 blocks only need the first two of the four computed rows.
      if (height == 2) return;
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[2];
      y += 4;
    } while (y < height);
  } else if (num_taps == 4) {
    srcs[4] = vdup_n_u8(0);

    // Prime the first 3 rows (num_taps - 1).
    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;

    int y = 0;
    do {
      // Load four new rows and build the three shifted windows.
      srcs[0] = Load2<3>(src, srcs[0]);
      src += src_stride;
      srcs[4] = Load2<0>(src, srcs[4]);
      src += src_stride;
      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[0], srcs[4], 6);

      // This uses srcs[0]..srcs[3].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      // 2x2 blocks only need the first two of the four computed rows.
      if (height == 2) return;
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      y += 4;
    } while (y < height);
  } else if (num_taps == 6) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4, so no early-out is needed here.
    assert(height > 4);
    srcs[8] = vdup_n_u8(0);

    // Prime the first 5 rows (num_taps - 1).
    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    srcs[4] = Load2(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[4], 2);

    int y = 0;
    do {
      srcs[4] = Load2<1>(src, srcs[4]);
      src += src_stride;
      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
      srcs[4] = Load2<2>(src, srcs[4]);
      src += src_stride;
      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[8], 2);

      // This uses srcs[0]..srcs[5].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[4] = srcs[8];
      y += 4;
    } while (y < height);
  } else if (num_taps == 8) {
    // During the vertical pass the number of taps is restricted when
    // |height| <= 4, so no early-out is needed here.
    assert(height > 4);
    srcs[8] = vdup_n_u8(0);

    // Prime the first 7 rows (num_taps - 1).
    srcs[0] = Load2(src);
    src += src_stride;
    srcs[0] = Load2<1>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<2>(src, srcs[0]);
    src += src_stride;
    srcs[0] = Load2<3>(src, srcs[0]);
    src += src_stride;
    srcs[4] = Load2(src);
    src += src_stride;
    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
    srcs[4] = Load2<1>(src, srcs[4]);
    src += src_stride;
    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
    srcs[4] = Load2<2>(src, srcs[4]);
    src += src_stride;
    srcs[3] = vext_u8(srcs[0], srcs[4], 6);

    int y = 0;
    do {
      srcs[4] = Load2<3>(src, srcs[4]);
      src += src_stride;
      srcs[8] = Load2<0>(src, srcs[8]);
      src += src_stride;
      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
      srcs[8] = Load2<1>(src, srcs[8]);
      src += src_stride;
      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
      srcs[8] = Load2<2>(src, srcs[8]);
      src += src_stride;
      srcs[7] = vext_u8(srcs[4], srcs[8], 6);

      // This uses srcs[0]..srcs[7].
      const int16x8_t sums =
          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);

      Store2<0>(dst8, results);
      dst8 += dst_stride;
      Store2<1>(dst8, results);
      dst8 += dst_stride;
      Store2<2>(dst8, results);
      dst8 += dst_stride;
      Store2<3>(dst8, results);
      dst8 += dst_stride;

      srcs[0] = srcs[4];
      srcs[1] = srcs[5];
      srcs[2] = srcs[6];
      srcs[3] = srcs[7];
      srcs[4] = srcs[8];
      y += 4;
    } while (y < height);
  }
}
2207
// This function is a simplified version of Convolve2D_C.
// It is called in single prediction mode, where only vertical filtering is
// required. The output is the single prediction of the block, clipped to the
// valid pixel range.
//
// Dispatch notes: |filter_index| selects the filter family and |width|
// selects the kernel specialization (2 / 4 / >= 8 wide). The 8-entry filter
// rows in |kAbsHalfSubPixelFilters| are centered, so shorter filters start at
// an offset into |taps|: +1 for 6 taps, +2 for 4 taps, +3 for 2 taps.
void ConvolveVertical_NEON(const void* const reference,
                           const ptrdiff_t reference_stride,
                           const int /*horizontal_filter_index*/,
                           const int vertical_filter_index,
                           const int /*horizontal_filter_id*/,
                           const int vertical_filter_id, const int width,
                           const int height, void* prediction,
                           const ptrdiff_t pred_stride) {
  const int filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(filter_index);
  const ptrdiff_t src_stride = reference_stride;
  // Back up to the first row touched by the filter window.
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride;
  auto* dest = static_cast<uint8_t*>(prediction);
  const ptrdiff_t dest_stride = pred_stride;
  // |filter_id| == 0 (integer position) is handled elsewhere.
  assert(vertical_filter_id != 0);

  // Broadcast each |abs-half| filter coefficient to all lanes.
  uint8x8_t taps[8];
  for (int k = 0; k < kSubPixelTaps; ++k) {
    taps[k] =
        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
  }

  // NOTE(review): the bitwise &/| on bool conditions below look intentional
  // (branch-free evaluation of the id tests) — presumably a deliberate
  // micro-optimization; confirm before "fixing" them to &&/||.
  if (filter_index == 0) {  // 6 tap.
    if (width == 2) {
      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else {
      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
                        taps + 1);
    }
  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
                                    (vertical_filter_id == 15))) {  // 5 tap.
    if (width == 2) {
      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
                           taps + 1);
    } else {
      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
                        taps + 1);
    }
  } else if ((filter_index == 1) &
             ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
              (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
    if (width == 2) {
      FilterVertical2xH<1,
                        /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, height, taps + 1);
    } else if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/false,
                        /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
          src, src_stride, dest, dest_stride, width, height, taps + 1);
    }
  } else if (filter_index == 2) {  // 8 tap.
    if (width == 2) {
      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
    } else if (width == 4) {
      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
    } else {
      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                        taps);
    }
  } else if (filter_index == 3) {  // 2 tap.
    if (width == 2) {
      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
                           taps + 3);
    } else if (width == 4) {
      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
                           taps + 3);
    } else {
      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
                        taps + 3);
    }
  } else if (filter_index == 4) {  // 4 tap.
    // Outside taps are negative.
    if (width == 2) {
      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else if (width == 4) {
      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else {
      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                        taps + 2);
    }
  } else {
    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
    // below map to 4 tap filters.
    assert(filter_index == 5 ||
           (filter_index == 1 &&
            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
             vertical_filter_id == 4 || vertical_filter_id == 5 ||
             vertical_filter_id == 6 || vertical_filter_id == 10 ||
             vertical_filter_id == 11 || vertical_filter_id == 12 ||
             vertical_filter_id == 13 || vertical_filter_id == 14)));
    // According to GetNumTapsInFilter() this has 6 taps but here we are
    // treating it as though it has 4.
    if (filter_index == 1) src += src_stride;
    if (width == 2) {
      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else if (width == 4) {
      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
                           taps + 2);
    } else {
      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
                        taps + 2);
    }
  }
}
2331
// Compound copy: no filtering. Source pixels are widened to 16 bits and
// shifted left so they sit at the same scale as other compound predictor
// outputs. The compound prediction buffer is densely packed: its row stride
// is |width| (|pred_stride| is unused).
void ConvolveCompoundCopy_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* prediction,
    const ptrdiff_t /*pred_stride*/) {
  const auto* src = static_cast<const uint8_t*>(reference);
  const ptrdiff_t src_stride = reference_stride;
  auto* dest = static_cast<uint16_t*>(prediction);
  // Scale factor bringing the copy up to compound precision.
  constexpr int final_shift =
      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;

  if (width >= 16) {
    for (int y = 0; y < height; ++y) {
      // 16 pixels per iteration: widen+shift each 8-pixel half separately.
      for (int x = 0; x < width; x += 16) {
        const uint8x16_t pixels = vld1q_u8(src + x);
        vst1q_u16(dest + x, vshll_n_u8(vget_low_u8(pixels), final_shift));
        vst1q_u16(dest + x + 8, vshll_n_u8(vget_high_u8(pixels), final_shift));
      }
      src += src_stride;
      dest += width;
    }
  } else if (width == 8) {
    for (int y = 0; y < height; ++y) {
      const uint8x8_t pixels = vld1_u8(src);
      vst1q_u16(dest, vshll_n_u8(pixels, final_shift));
      src += src_stride;
      dest += width;
    }
  } else { /* width == 4 */
    // Pack two 4-pixel rows into one 8-lane register per iteration.
    uint8x8_t pixels = vdup_n_u8(0);
    for (int y = 0; y < height; y += 2) {
      pixels = Load4<0>(src, pixels);
      src += src_stride;
      pixels = Load4<1>(src, pixels);
      src += src_stride;
      vst1q_u16(dest, vshll_n_u8(pixels, final_shift));
      dest += 4 << 1;
    }
  }
}
2387
// Compound (16-bit output) vertical-only convolution. Mirrors
// ConvolveVertical_NEON's dispatch, but the minimum compound width is 4, so
// there is no 2xH path, and the destination is a densely packed uint16_t
// buffer whose row stride is |width| (|pred_stride| is unused). Shorter
// filters start at an offset into the centered 8-entry |taps| row: +1 for
// 6 taps, +2 for 4 taps, +3 for 2 taps.
void ConvolveCompoundVertical_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int vertical_filter_index,
    const int /*horizontal_filter_id*/, const int vertical_filter_id,
    const int width, const int height, void* prediction,
    const ptrdiff_t /*pred_stride*/) {
  const int filter_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(filter_index);
  const ptrdiff_t src_stride = reference_stride;
  // Back up to the first row touched by the filter window.
  const auto* src = static_cast<const uint8_t*>(reference) -
                    (vertical_taps / 2 - 1) * src_stride;
  auto* dest = static_cast<uint16_t*>(prediction);
  // |filter_id| == 0 (integer position) is handled elsewhere.
  assert(vertical_filter_id != 0);

  // Broadcast each |abs-half| filter coefficient to all lanes.
  uint8x8_t taps[8];
  for (int k = 0; k < kSubPixelTaps; ++k) {
    taps[k] =
        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
  }

  if (filter_index == 0) {  // 6 tap.
    if (width == 4) {
      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 1);
    } else {
      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 1);
    }
  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
                                    (vertical_filter_id == 15))) {  // 5 tap.
    if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 1);
    }
  } else if ((filter_index == 1) &
             ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
              (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
    if (width == 4) {
      FilterVertical4xH<1, /*is_compound=*/true,
                        /*negative_outside_taps=*/true>(src, src_stride, dest,
                                                        4, height, taps + 1);
    } else {
      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
          src, src_stride, dest, width, width, height, taps + 1);
    }
  } else if (filter_index == 2) {  // 8 tap.
    if (width == 4) {
      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps);
    } else {
      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps);
    }
  } else if (filter_index == 3) {  // 2 tap.
    if (width == 4) {
      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 3);
    } else {
      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 3);
    }
  } else if (filter_index == 4) {  // 4 tap.
    if (width == 4) {
      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 2);
    } else {
      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 2);
    }
  } else {
    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
    // to 4 tap filters.
    assert(filter_index == 5 ||
           (filter_index == 1 &&
            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
             vertical_filter_id == 4 || vertical_filter_id == 5 ||
             vertical_filter_id == 6 || vertical_filter_id == 10 ||
             vertical_filter_id == 11 || vertical_filter_id == 12 ||
             vertical_filter_id == 13 || vertical_filter_id == 14)));
    // According to GetNumTapsInFilter() this has 6 taps but here we are
    // treating it as though it has 4.
    if (filter_index == 1) src += src_stride;
    if (width == 4) {
      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                 height, taps + 2);
    } else {
      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
                                              width, height, taps + 2);
    }
  }
}
2482
ConvolveCompoundHorizontal_NEON(const void * const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * prediction,const ptrdiff_t)2483 void ConvolveCompoundHorizontal_NEON(
2484 const void* const reference, const ptrdiff_t reference_stride,
2485 const int horizontal_filter_index, const int /*vertical_filter_index*/,
2486 const int horizontal_filter_id, const int /*vertical_filter_id*/,
2487 const int width, const int height, void* prediction,
2488 const ptrdiff_t /*pred_stride*/) {
2489 const int filter_index = GetFilterIndex(horizontal_filter_index, width);
2490 const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
2491 auto* dest = static_cast<uint16_t*>(prediction);
2492
2493 DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
2494 src, reference_stride, dest, width, width, height, horizontal_filter_id,
2495 filter_index);
2496 }
2497
// Compound 2D convolution: a horizontal pass into a stack buffer followed by
// a vertical pass that writes the unclipped 16-bit compound prediction.
// |pred_stride| is unused; the compound output is packed at |width| stride.
void ConvolveCompound2D_NEON(const void* const reference,
                             const ptrdiff_t reference_stride,
                             const int horizontal_filter_index,
                             const int vertical_filter_index,
                             const int horizontal_filter_id,
                             const int vertical_filter_id, const int width,
                             const int height, void* prediction,
                             const ptrdiff_t /*pred_stride*/) {
  // The output of the horizontal filter, i.e. the intermediate_result, is
  // guaranteed to fit in int16_t.
  uint16_t
      intermediate_result[kMaxSuperBlockSizeInPixels *
                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];

  // Horizontal filter.
  // Filter types used for width <= 4 are different from those for width > 4.
  // When width > 4, the valid filter index range is always [0, 3].
  // When width <= 4, the valid filter index range is always [4, 5].
  // Similarly for height.
  const int horizontal_index = GetFilterIndex(horizontal_filter_index, width);
  const int vertical_index = GetFilterIndex(vertical_filter_index, height);
  const int vertical_taps = GetNumTapsInFilter(vertical_index);
  const ptrdiff_t src_stride = reference_stride;
  // Begin (vertical_taps / 2 - 1) rows above and kHorizontalOffset pixels to
  // the left of the block so both passes see every pixel their taps touch.
  const auto* const src = static_cast<const uint8_t*>(reference) -
                          (vertical_taps / 2 - 1) * src_stride -
                          kHorizontalOffset;

  // The horizontal pass must produce height + vertical_taps - 1 rows for the
  // vertical pass to consume.
  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
      src, src_stride, intermediate_result, width, width,
      height + vertical_taps - 1, horizontal_filter_id, horizontal_index);

  // Vertical filter.
  auto* const dest = static_cast<uint16_t*>(prediction);
  assert(vertical_filter_id != 0);

  const ptrdiff_t dest_stride = width;
  // The filters are pre-shifted by 1 (kHalfSubPixelFilters); widen them to 16
  // bits for the multiply-accumulate.
  const int16x8_t taps = vmovl_s8(
      vld1_s8(kHalfSubPixelFilters[vertical_index][vertical_filter_id]));

  if (width == 4) {
    switch (vertical_taps) {
      case 8:
        Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
                                                     dest_stride, height, taps);
        break;
      case 6:
        Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
                                                     dest_stride, height, taps);
        break;
      case 4:
        Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
                                                     dest_stride, height, taps);
        break;
      default:  // |vertical_taps| == 2.
        Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
                                                     dest_stride, height, taps);
        break;
    }
  } else {
    switch (vertical_taps) {
      case 8:
        Filter2DVertical<8, /*is_compound=*/true>(
            intermediate_result, dest, dest_stride, width, height, taps);
        break;
      case 6:
        Filter2DVertical<6, /*is_compound=*/true>(
            intermediate_result, dest, dest_stride, width, height, taps);
        break;
      case 4:
        Filter2DVertical<4, /*is_compound=*/true>(
            intermediate_result, dest, dest_stride, width, height, taps);
        break;
      default:  // |vertical_taps| == 2.
        Filter2DVertical<2, /*is_compound=*/true>(
            intermediate_result, dest, dest_stride, width, height, taps);
        break;
    }
  }
}
2572
// Writes 16 rounding averages of each source byte and its right-hand
// neighbor: dst[i] = (src[i] + src[i + 1] + 1) >> 1. Reads 17 bytes from
// |src|.
inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
  vst1q_u8(dst, vrhaddq_u8(vld1q_u8(src), vld1q_u8(src + 1)));
}
2578
// Horizontal-only intra block copy filter for blocks at least 16 pixels
// wide: every output pixel is the rounding average of the source pixel and
// its right-hand neighbor, processed 16 pixels at a time. Each row reads
// width + 1 source bytes.
template <int width>
inline void IntraBlockCopyHorizontal(const uint8_t* src,
                                     const ptrdiff_t src_stride,
                                     const int height, uint8_t* dst,
                                     const ptrdiff_t dst_stride) {
  int y = height;
  do {
    int x = 0;
    do {
      const uint8x16_t left = vld1q_u8(src + x);
      const uint8x16_t right = vld1q_u8(src + x + 1);
      vst1q_u8(dst + x, vrhaddq_u8(left, right));
      x += 16;
    } while (x < width);
    src += src_stride;
    dst += dst_stride;
  } while (--y != 0);
}
2621
// Intra block copy with a horizontal half-pel offset: every output pixel is
// the rounding average of the reference pixel and its right-hand neighbor.
// Intra block copy never applies subpixel filter taps, so the filter
// index/id arguments are unused. Dispatches on |width|; widths >= 16 go
// through the 16-pixel-chunk template, narrower widths are handled inline.
void ConvolveIntraBlockCopyHorizontal_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
    const int height, void* const prediction, const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
                                  pred_stride);
  } else if (width == 64) {
    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 32) {
    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 16) {
    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
                                 pred_stride);
  } else if (width == 8) {
    // One 8-pixel row per iteration; vld1_u8(src + 1) reads 9 bytes per row.
    int y = 0;
    do {
      const uint8x8_t left = vld1_u8(src);
      const uint8x8_t right = vld1_u8(src + 1);
      vst1_u8(dest, vrhadd_u8(left, right));

      src += reference_stride;
      dest += pred_stride;
    } while (++y < height);
  } else if (width == 4) {
    // Two 4-pixel rows per iteration, packed into the low and high halves of
    // one 8-byte vector. The loop steps |y| by 2 — assumes |height| is even
    // for width-4 blocks; TODO confirm with callers.
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  } else {
    assert(width == 2);
    // Two 2-pixel rows per iteration, held in 16-bit lanes 0 and 1 of the
    // vector; same even-|height| assumption as the width-4 path.
    uint8x8_t left = vdup_n_u8(0);
    uint8x8_t right = vdup_n_u8(0);
    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<1>(src, left);
      right = Load2<1>(src + 1, right);
      src += reference_stride;

      const uint8x8_t result = vrhadd_u8(left, right);

      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<1>(dest, result);
      dest += pred_stride;
      y += 2;
    } while (y < height);
  }
}
2695
// Vertical-only intra block copy filter for blocks at least 16 pixels wide:
// each output pixel is the rounding average of the source pixel and the
// pixel directly below it. The previous row is carried in |row| so every
// source row is loaded exactly once; reads height + 1 source rows in total.
template <int width>
inline void IntraBlockCopyVertical(const uint8_t* src,
                                   const ptrdiff_t src_stride, const int height,
                                   uint8_t* dst, const ptrdiff_t dst_stride) {
  // The unrolled loops below advance the pointers by 16 per vector, covering
  // width - 16 bytes before the end of a row, so the stride applied at the
  // end of each row is reduced by that amount.
  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
  // Only width / 16 entries are used; 8 covers the largest width (128).
  uint8x16_t row[8], below[8];

  // Prime |row| with the first source row.
  row[0] = vld1q_u8(src);
  if (width >= 32) {
    src += 16;
    row[1] = vld1q_u8(src);
    if (width >= 64) {
      src += 16;
      row[2] = vld1q_u8(src);
      src += 16;
      row[3] = vld1q_u8(src);
      if (width == 128) {
        src += 16;
        row[4] = vld1q_u8(src);
        src += 16;
        row[5] = vld1q_u8(src);
        src += 16;
        row[6] = vld1q_u8(src);
        src += 16;
        row[7] = vld1q_u8(src);
      }
    }
  }
  src += src_remainder_stride;

  int y = 0;
  do {
    // Load the next source row into |below|.
    below[0] = vld1q_u8(src);
    if (width >= 32) {
      src += 16;
      below[1] = vld1q_u8(src);
      if (width >= 64) {
        src += 16;
        below[2] = vld1q_u8(src);
        src += 16;
        below[3] = vld1q_u8(src);
        if (width == 128) {
          src += 16;
          below[4] = vld1q_u8(src);
          src += 16;
          below[5] = vld1q_u8(src);
          src += 16;
          below[6] = vld1q_u8(src);
          src += 16;
          below[7] = vld1q_u8(src);
        }
      }
    }
    src += src_remainder_stride;

    // Store the rounding averages and shift |below| into |row| for the next
    // iteration.
    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
    row[0] = below[0];
    if (width >= 32) {
      dst += 16;
      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
      row[1] = below[1];
      if (width >= 64) {
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
        row[2] = below[2];
        dst += 16;
        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
        row[3] = below[3];
        // Use == 128, matching the two load sections above (was >= 128;
        // behaviorally identical since 128 is the maximum width, but the
        // inconsistent guard read as a potential bug).
        if (width == 128) {
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
          row[4] = below[4];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
          row[5] = below[5];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
          row[6] = below[6];
          dst += 16;
          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
          row[7] = below[7];
        }
      }
    }
    dst += dst_remainder_stride;
  } while (++y < height);
}
2784
// Intra block copy with a vertical half-pel offset: every output pixel is
// the rounding average of the reference pixel and the pixel directly below
// it. The filter arguments are unused because intra block copy never applies
// subpixel filter taps. Dispatches on |width|; widths >= 16 go through the
// 16-pixel-chunk template, narrower widths are handled inline. Every path
// reads height + 1 reference rows.
void ConvolveIntraBlockCopyVertical_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* const prediction,
    const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);

  if (width == 128) {
    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
                                pred_stride);
  } else if (width == 64) {
    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 32) {
    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 16) {
    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
                               pred_stride);
  } else if (width == 8) {
    // Keep the previous row in |row| so each source row is loaded only once.
    uint8x8_t row, below;
    row = vld1_u8(src);
    src += reference_stride;

    int y = 0;
    do {
      below = vld1_u8(src);
      src += reference_stride;

      vst1_u8(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else if (width == 4) {
    // Same single-load scheme with 4-byte lane loads/stores.
    uint8x8_t row = Load4(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load4<0>(src, below);
      src += reference_stride;

      StoreLo4(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  } else {
    assert(width == 2);
    // Same single-load scheme with 2-byte lane loads/stores.
    uint8x8_t row = Load2(src);
    uint8x8_t below = vdup_n_u8(0);
    src += reference_stride;

    int y = 0;
    do {
      below = Load2<0>(src, below);
      src += reference_stride;

      Store2<0>(dest, vrhadd_u8(row, below));
      dest += pred_stride;

      row = below;
    } while (++y < height);
  }
}
2854
// 2D (half-pel in both directions) intra block copy filter for blocks at
// least 8 pixels wide. Each output pixel is the rounding average of a 2x2
// neighborhood: (tl + tr + bl + br + 2) >> 2. The horizontal pair sums of
// the previous row are cached in |row| so every source row is loaded and
// summed exactly once; reads height + 1 rows of width + 1 bytes.
template <int width>
inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
                             const int height, uint8_t* dst,
                             const ptrdiff_t dst_stride) {
  // Only width / 8 entries are used; 16 covers the largest width (128).
  uint16x8_t row[16];

  // Prime |row| with the horizontal pair sums of the first source row.
  int x = 0;
  do {
    row[x >> 3] = vaddl_u8(vld1_u8(src + x), vld1_u8(src + x + 1));
    x += 8;
  } while (x < width);
  src += src_stride;

  int y = height;
  do {
    x = 0;
    do {
      const uint16x8_t below =
          vaddl_u8(vld1_u8(src + x), vld1_u8(src + x + 1));
      // (row + below + 2) >> 2, narrowed back to 8 bits.
      vst1_u8(dst + x, vrshrn_n_u16(vaddq_u16(row[x >> 3], below), 2));
      row[x >> 3] = below;
      x += 8;
    } while (x < width);
    src += src_stride;
    dst += dst_stride;
  } while (--y != 0);
}
3016
// Intra block copy with a half-pel offset in both directions: every output
// pixel is the rounding average of a 2x2 neighborhood of the reference,
// (tl + tr + bl + br + 2) >> 2. The filter arguments are unused because
// intra block copy never applies subpixel filter taps.
void ConvolveIntraBlockCopy2D_NEON(
    const void* const reference, const ptrdiff_t reference_stride,
    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
    const int width, const int height, void* const prediction,
    const ptrdiff_t pred_stride) {
  const auto* src = static_cast<const uint8_t*>(reference);
  auto* dest = static_cast<uint8_t*>(prediction);
  // Note: allow vertical access to height + 1. Because this function is only
  // for u/v plane of intra block copy, such access is guaranteed to be within
  // the prediction block.

  if (width == 128) {
    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 64) {
    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 32) {
    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 16) {
    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 8) {
    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
  } else if (width == 4) {
    // Two rows per iteration; |row| holds the horizontal pair sums of the
    // row above the current output row. Steps |y| by 2 — assumes |height| is
    // even for these block sizes; TODO confirm with callers.
    uint8x8_t left = Load4(src);
    uint8x8_t right = Load4(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load4<0>(src, left);
      right = Load4<0>(src + 1, right);
      src += reference_stride;
      left = Load4<1>(src, left);
      right = Load4<1>(src + 1, right);
      src += reference_stride;

      // Horizontal pair sums of the next two rows: lanes 0-3 are the first
      // row, lanes 4-7 the second.
      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      StoreLo4(dest, result);
      dest += pred_stride;
      StoreHi4(dest, result);
      dest += pred_stride;

      // Carry the second row's sums into the next iteration.
      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  } else {
    // width == 2: same two-row scheme. The second row is deliberately loaded
    // into 16-bit lane 2 (upper half of the vector) so that
    // vget_high_u16(below) carries its sums into the next iteration, and
    // Store2<2> writes it back out.
    uint8x8_t left = Load2(src);
    uint8x8_t right = Load2(src + 1);
    src += reference_stride;

    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));

    int y = 0;
    do {
      left = Load2<0>(src, left);
      right = Load2<0>(src + 1, right);
      src += reference_stride;
      left = Load2<2>(src, left);
      right = Load2<2>(src + 1, right);
      src += reference_stride;

      const uint16x8_t below = vaddl_u8(left, right);

      const uint8x8_t result = vrshrn_n_u16(
          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
      Store2<0>(dest, result);
      dest += pred_stride;
      Store2<2>(dest, result);
      dest += pred_stride;

      row = vget_high_u16(below);
      y += 2;
    } while (y < height);
  }
}
3097
// Registers the NEON convolve implementations in the 8-bit dsp table. Based
// on the assignments below, the convolve[][][][] indices are, in order:
// [is intra block copy][is compound][has vertical filter][has horizontal
// filter]. Entries not assigned here keep whatever was previously installed.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
  dsp->convolve[0][0][1][1] = Convolve2D_NEON;

  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;

  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;

  // Scaled convolution, with and without compound prediction.
  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
}
3117
3118 } // namespace
3119 } // namespace low_bitdepth
3120
// Public entry point: installs the NEON 8-bit convolve implementations.
void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
3122
3123 } // namespace dsp
3124 } // namespace libgav1
3125
3126 #else // !LIBGAV1_ENABLE_NEON
3127
3128 namespace libgav1 {
3129 namespace dsp {
3130
// NEON is not enabled in this build; keep an empty stub so callers can
// invoke the initializer unconditionally.
void ConvolveInit_NEON() {}
3132
3133 } // namespace dsp
3134 } // namespace libgav1
3135 #endif // LIBGAV1_ENABLE_NEON
3136