// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/motion_field_projection.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/types.h"

namespace libgav1 {
namespace dsp {
namespace {

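// Projects |delta| (a motion vector component already scaled by the division
// table and the temporal distance) to an 8x8 block offset: divide by 64
// rounding towards zero, then conditionally negate with |dst_sign| (0 or -1).
// Vector counterpart of the scalar Project() used in the leftover loop below,
// except that the base position is added by the callers.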
inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
  // Add 63 to negative delta so that it shifts towards zero.
  const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
  const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
  const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
  const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
  const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
  const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
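  // (x ^ s) - s negates x when s == -1 and is a no-op when s == 0, i.e. it
  // applies |dst_sign| to the offset.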
  const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
  const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
  return vqmovn_s16(offset2);
}

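// Uses vtbl4_s8() as a 32-byte table lookup: each byte of |idx| selects a byte
// of |division_table|, and indices outside [0, 31] produce 0. LoadDivision()
// relies on this zero-fill behavior to merge the two table halves with a
// bitwise OR.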
inline int16x8_t LookupTable(const int8x8x4_t division_table,
                             const int8x16_t idx) {
  const int8x8_t idx_low = vget_low_s8(idx);
  const int8x8_t idx_high = vget_high_s8(idx);
  const int16x4_t d0 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_low));
  const int16x4_t d1 = vreinterpret_s16_s8(vtbl4_s8(division_table, idx_high));
  return vcombine_s16(d0, d1);
}

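// Gathers kProjectionMvDivisionLookup[reference_offset[i]] into 8 int16_t
// lanes. The 64-byte table is split across two int8x8x4_t halves; each byte
// index is built as 2 * offset or 2 * offset + 1 so that every 16-bit entry is
// assembled from two byte lookups.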
inline int16x8_t LoadDivision(const int8x8x4_t division_table[2],
                              const int8x8_t reference_offset) {
  const int8x16_t k32 = vdupq_n_s8(32);
  const int8x8_t kOne = vcreate_s8(0x0100010001000100);
  const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
  const int8x8_t t = vadd_s8(reference_offset, reference_offset);
  const int8x8x2_t tt = vzip_s8(t, t);
  const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
  const int8x16_t idx0 = vaddq_s8(t1, kOneQ);
  const int8x16_t idx1 = vsubq_s8(idx0, k32);
  const int16x8_t denorm0 = LookupTable(division_table[0], idx0);
  const int16x8_t denorm1 = LookupTable(division_table[1], idx1);
  return vorrq_s16(denorm0, denorm1);
}

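// Computes (mv * denominator * numerator) >> 14 with rounding and saturation,
// where |denominator| holds the division table entries for the source
// reference and |numerator| is |reference_to_current_with_sign|. Mirrors the
// scalar GetMvProjection() used in the leftover loop below.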
inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
                              const int numerator) {
  const int32x4_t m0 = vmull_s16(mv, denominator);
  const int32x4_t m = vmulq_n_s32(m0, numerator);
  // Subtract the sign bit to round towards zero.
  const int32x4_t sub_sign = vsraq_n_s32(m, m, 31);
  return vqrshrn_n_s32(sub_sign, 14);
}

inline int16x8_t MvProjectionClip(const int16x8_t mv,
                                  const int16x8_t denominator,
                                  const int numerator) {
  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
  const int16x4_t mv0 = vget_low_s16(mv);
  const int16x4_t mv1 = vget_high_s16(mv);
  const int16x4_t m0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
  const int16x4_t m1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
  const int16x8_t m = vcombine_s16(m0, m1);
  const int16x8_t clamp = vminq_s16(m, projection_mv_clamp);
  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
}

inline void GetMvProjection(const int32x4_t mv[2], const int16x8_t denominator,
                            const int numerator, int16x8_t projection_mv[2]) {
  const int16x8_t mv0 = vreinterpretq_s16_s32(mv[0]);
  const int16x8_t mv1 = vreinterpretq_s16_s32(mv[1]);
  // Deinterleave the motion vectors into row (y) and column (x) components.
  const int16x8x2_t mvs = vuzpq_s16(mv0, mv1);
  projection_mv[0] = MvProjectionClip(mvs.val[0], denominator, numerator);
  projection_mv[1] = MvProjectionClip(mvs.val[1], denominator, numerator);
}

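// Projects 8 motion vectors starting at |mv + x8| onto the current frame.
// Outputs the per-lane reference offsets |r|, the projected row and column
// offsets |position_y8| / |position_x8| (relative to the current block row and
// to |x8|), and |skip_64|, whose bytes are -1 for lanes that must not be
// stored (skipped reference or out-of-range position) and 0 otherwise.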
void GetPosition(const int8x8x4_t division_table[2],
                 const MotionVector* const mv,
                 const int reference_to_current_with_sign, const int x8_start,
                 const int x8_end, const int x8, const int8x8_t r_offsets,
                 const int8x8_t source_reference_type8, const int8x8_t skip_r,
                 const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
                 const int16x8_t d_sign, const int delta, int8x8_t* const r,
                 int8x8_t* const position_y8, int8x8_t* const position_x8,
                 int64_t* const skip_64, int32x4_t mvs[2]) {
  const int32_t* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
  *r = vtbl1_s8(r_offsets, source_reference_type8);
  const int16x8_t denorm = LoadDivision(division_table, *r);
  int16x8_t projection_mv[2];
  mvs[0] = vld1q_s32(mv_int + 0);
  mvs[1] = vld1q_s32(mv_int + 4);
  // reference_to_current_with_sign could be 0.
  GetMvProjection(mvs, denorm, reference_to_current_with_sign, projection_mv);
  // Do not update the motion vector if the block position is not valid or
  // if position_x8 is outside the current range of x8_start and x8_end.
  // Note that position_y8 will always be within the range of y8_start and
  // y8_end.
  // After subtracting the base, valid projections are within 8-bit.
  *position_y8 = Project_NEON(projection_mv[0], d_sign);
  const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
  const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
  *position_x8 = vqadd_s8(position_x, k01234567);
  const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
  const int x8_floor = std::max(
      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
  const int x8_ceiling = std::min(
      x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
  const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
  const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
  const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
  const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
  const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
  const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
  const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
  const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
  const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
  *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
}

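// vgetq_lane_s16() requires a compile-time constant lane index, so the lane is
// selected through a template parameter rather than a runtime value.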
template <int idx>
int16_t VgetqLaneS16(const int16x8_t src) {
  if (idx == 0) return vgetq_lane_s16(src, 0);
  if (idx == 1) return vgetq_lane_s16(src, 1);
  if (idx == 2) return vgetq_lane_s16(src, 2);
  if (idx == 3) return vgetq_lane_s16(src, 3);
  if (idx == 4) return vgetq_lane_s16(src, 4);
  if (idx == 5) return vgetq_lane_s16(src, 5);
  if (idx == 6) return vgetq_lane_s16(src, 6);
  return vgetq_lane_s16(src, 7);
}

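// Stores lane |idx| of the projected motion vector and reference offset at its
// destination. |position| holds per-lane destination offsets in 8x8-block
// units; |mvs| holds 4 motion vectors, so lane |idx & 3| selects the correct
// one within the half passed by the caller.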
template <int idx>
inline void Store(const int16x8_t position, const int8x8_t reference_offset,
                  const int32x4_t mvs, int8_t* dst_reference_offset,
                  MotionVector* dst_mv) {
  const ptrdiff_t offset = VgetqLaneS16<idx>(position);
  int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
  vst1q_lane_s32(d_mv, mvs, idx & 3);
  vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
}

template <int idx>
inline void CheckStore(const int8_t* skips, const int16x8_t position,
                       const int8x8_t reference_offset, const int32x4_t mvs,
                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
  if (skips[idx] == 0) {
    const ptrdiff_t offset = VgetqLaneS16<idx>(position);
    int32_t* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
    vst1q_lane_s32(d_mv, mvs, idx & 3);
    vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
  }
}

// 7.9.2.
void MotionFieldProjectionKernel_NEON(
    const ReferenceFrameType* source_reference_type, const MotionVector* mv,
    const uint8_t order_hint[kNumReferenceFrameTypes],
    unsigned int current_frame_order_hint, unsigned int order_hint_shift_bits,
    int reference_to_current_with_sign, int dst_sign, int y8_start, int y8_end,
    int x8_start, int x8_end, TemporalMotionField* motion_field) {
  const ptrdiff_t stride = motion_field->mv.columns();
  // The column range has to be extended by kProjectionMvMaxHorizontalOffset on
  // either side, since source columns in that margin can still project into
  // [x8_start, x8_end).
  const int adjusted_x8_start =
      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
  const int adjusted_x8_end = std::min(
      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
  const int leftover = adjusted_x8_end - adjusted_x8_end8;
  const int8_t* const table =
      reinterpret_cast<const int8_t*>(kProjectionMvDivisionLookup);
  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
  MotionVector* dst_mv = motion_field->mv[y8_start];
  const int16x8_t d_sign = vdupq_n_s16(dst_sign);
  int8_t reference_offsets[kNumReferenceFrameTypes];
  bool skip_reference[kNumReferenceFrameTypes];
  int8x8x4_t division_table[2];

  static_assert(sizeof(int8_t) == sizeof(bool), "");
  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
  assert(dst_sign == 0 || dst_sign == -1);
  assert(stride == motion_field->reference_offset.columns());
  assert((y8_start & 7) == 0);
  assert((adjusted_x8_start & 7) == 0);
  // The final position calculation is represented with int16_t. Valid
  // position_y8 offsets from their base are at most 7. After adding the
  // horizontal offset, which is at most |stride - 1|, we get the following
  // assertion, which means this optimization works for frame widths up to 32K
  // pixels (each position is an 8x8 block).
  assert(8 * stride <= 32768);

  const int8x8_t current_order_hints = vdup_n_s8(current_frame_order_hint);
  const int8x8_t order_hints = vreinterpret_s8_u8(vld1_u8(order_hint));
  const int8x8_t diff = vsub_s8(current_order_hints, order_hints);
  // |order_hint_shift_bits| - 24 could be -24. In that case |diff| is 0, the
  // behavior of left or right shifting by -24 bits is defined for ARM NEON
  // instructions, and shifting 0 still yields 0.
  const int8x8_t left_shift_bits = vdup_n_s8(order_hint_shift_bits - 24);
  const int8x8_t diff_shift_left = vshl_s8(diff, left_shift_bits);
  const int8x8_t r_offsets = vshl_s8(diff_shift_left, vneg_s8(left_shift_bits));
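  // |r_offsets| now holds the signed temporal distance of each reference frame
  // from the current frame: the left shift followed by the negated (right)
  // shift sign-extends the wrapped order hint differences. Distances outside
  // (0, kMaxFrameDistance] are flagged next so those references are skipped.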
  const uint8x8_t overflow = vcgt_s8(r_offsets, vdup_n_s8(kMaxFrameDistance));
  const uint8x8_t underflow = vcle_s8(r_offsets, vdup_n_s8(0));
  const int8x8_t sk = vreinterpret_s8_u8(vorr_u8(overflow, underflow));
  // Initialize skip_reference[kReferenceFrameIntra] to simplify branch
  // conditions in projection.
  const int8x8_t skip_reference8 = vset_lane_s8(-1, sk, 0);
  vst1_s8(reinterpret_cast<int8_t*>(skip_reference), skip_reference8);
  vst1_s8(reference_offsets, r_offsets);

  // The compiler is inefficient when using vld4_s64(): instructions are wasted
  // copying from int64x1x4_t to int8x8x4_t, and to the best of our knowledge
  // there is no vector reinterpret intrinsic for that conversion. In any case
  // the compiler does well enough with the plain vld1_s8() loads below.
  division_table[0].val[0] = vld1_s8(table + 0 * 8);
  division_table[0].val[1] = vld1_s8(table + 1 * 8);
  division_table[0].val[2] = vld1_s8(table + 2 * 8);
  division_table[0].val[3] = vld1_s8(table + 3 * 8);
  division_table[1].val[0] = vld1_s8(table + 4 * 8);
  division_table[1].val[1] = vld1_s8(table + 5 * 8);
  division_table[1].val[2] = vld1_s8(table + 6 * 8);
  division_table[1].val[3] = vld1_s8(table + 7 * 8);

  int y8 = y8_start;
  do {
    const int y8_floor = (y8 & ~7) - y8;                         // [-7, 0]
    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);  // [1, 8]
    const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
    const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
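    // Projected rows must stay within the aligned band of 8 rows containing y8
    // (one 64-pixel-tall row of 8x8 blocks) and within [y8_start, y8_end), so
    // position_y8 is compared against [y8_floor, y8_ceiling) in GetPosition().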
    int x8;

    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
      const int8x8_t source_reference_type8 =
          vld1_s8(reinterpret_cast<const int8_t*>(source_reference_type + x8));
      const int8x8_t skip_r = vtbl1_s8(skip_reference8, source_reference_type8);
      const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
      // Early termination #1 if all are skips. Chance is typically ~30-40%.
      if (early_skip == -1) continue;
      int64_t skip_64;
      int8x8_t r, position_x8, position_y8;
      int32x4_t mvs[2];
      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
                  &position_x8, &skip_64, mvs);
      // Early termination #2 if all are skips.
      // Chance is typically ~15-25% after Early termination #1.
      if (skip_64 == -1) continue;
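      // Compute the per-lane destination offset in 8x8-block units, relative
      // to dst_mv / dst_reference_offset (which point at the current row):
      // projected row offset * stride + x8 + projected column offset.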
      const int16x8_t p_y = vmovl_s8(position_y8);
      const int16x8_t p_x = vmovl_s8(position_x8);
      const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride);
      const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8));
      if (skip_64 == 0) {
        // Store all. Chance is typically ~70-85% after Early termination #2.
        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
      } else {
        // Check and store each.
        // Chance is typically ~15-30% after Early termination #2.
        // The compiler is smart enough to not create the local buffer skips[].
        int8_t skips[8];
        memcpy(skips, &skip_64, sizeof(skips));
        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
      }
    }

    // The following leftover processing cannot be moved out of the do...while
    // loop. Doing so could change the order in which results are stored to the
    // same position.
    if (leftover > 0) {
      // Use SIMD only when leftover is at least 4, and there are at least 8
      // elements in a row.
      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
        // Process the last 8 elements to avoid loading invalid memory. Some
        // elements may have been processed in the above loop, which is OK.
        const int delta = 8 - leftover;
        x8 = adjusted_x8_end - 8;
        const int8x8_t source_reference_type8 = vld1_s8(
            reinterpret_cast<const int8_t*>(source_reference_type + x8));
        const int8x8_t skip_r =
            vtbl1_s8(skip_reference8, source_reference_type8);
        const int64_t early_skip =
            vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
        // Early termination #1 if all are skips.
        if (early_skip != -1) {
          int64_t skip_64;
          int8x8_t r, position_x8, position_y8;
          int32x4_t mvs[2];
          GetPosition(division_table, mv, reference_to_current_with_sign,
                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
                      &position_y8, &position_x8, &skip_64, mvs);
          // Early termination #2 if all are skips.
          if (skip_64 != -1) {
            const int16x8_t p_y = vmovl_s8(position_y8);
            const int16x8_t p_x = vmovl_s8(position_x8);
            const int16x8_t p_xy = vmlaq_n_s16(p_x, p_y, stride);
            const int16x8_t position = vaddq_s16(p_xy, vdupq_n_s16(x8));
            // Store up to 7 elements since leftover is at most 7.
            if (skip_64 == 0) {
              // Store all.
              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
            } else {
              // Check and store each.
              // The compiler is smart enough to not create the local buffer
              // skips[].
              int8_t skips[8];
              memcpy(skips, &skip_64, sizeof(skips));
              CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
                            dst_mv);
              CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
              CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
                            dst_mv);
            }
          }
        }
      } else {
        for (; x8 < adjusted_x8_end; ++x8) {
          if (skip_reference[source_reference_type[x8]]) continue;
          const int reference_offset =
              reference_offsets[source_reference_type[x8]];
          MotionVector projection_mv;
          // reference_to_current_with_sign could be 0.
          GetMvProjection(mv[x8], reference_to_current_with_sign,
                          reference_offset, &projection_mv);
          // Do not update the motion vector if the block position is not valid
          // or if position_x8 is outside the current range of x8_start and
          // x8_end. Note that position_y8 will always be within the range of
          // y8_start and y8_end.
          const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
          if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
          const int x8_base = x8 & ~7;
          const int x8_floor =
              std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
          const int x8_ceiling =
              std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
          const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
          if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
          dst_mv[position_y8 * stride + position_x8] = mv[x8];
          dst_reference_offset[position_y8 * stride + position_x8] =
              reference_offset;
        }
      }
    }

    source_reference_type += stride;
    mv += stride;
    dst_reference_offset += stride;
    dst_mv += stride;
  } while (++y8 < y8_end);
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
}

#if LIBGAV1_MAX_BITDEPTH >= 10
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
}
#endif

}  // namespace

void MotionFieldProjectionInit_NEON() {
  Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  Init10bpp();
#endif
}

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

void MotionFieldProjectionInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON