1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/motion_vector_search.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
20 #include <arm_neon.h>
21 
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/utils/common.h"
29 #include "src/utils/constants.h"
30 #include "src/utils/types.h"
31 
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35 
// Scales four 16-bit motion vector components lane by lane.
//
// Every lane evaluates mv * denominator * numerator with 32-bit arithmetic
// and then narrows the product back to 16 bits using a saturating, rounding
// right shift by 14.
inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
                              const int32x4_t numerator) {
  const int32x4_t product = vmulq_s32(vmull_s16(mv, denominator), numerator);
  // An arithmetic shift by 31 gives 0 for non-negative lanes and -1 for
  // negative ones. Accumulating that into the product makes the rounding
  // shift below treat negative values as the mirror image of positive ones.
  const int32x4_t sign_adjusted = vsraq_n_s32(product, product, 31);
  return vqrshrn_n_s32(sign_adjusted, 14);
}
44 
// Projects the four lanes of |mv| with a single divisor and two numerators.
//
// The divisor is kProjectionMvDivisionLookup[temporal_reference_offsets],
// broadcast to every lane. Lanes 0-1 are multiplied by reference_offsets[0]
// and lanes 2-3 by reference_offsets[1].
inline int16x4_t MvProjectionCompound(const int16x4_t mv,
                                      const int temporal_reference_offsets,
                                      const int reference_offsets[2]) {
  // Zipping the pair with itself produces {r0, r0} and {r1, r1}, which are
  // then joined into {r0, r0, r1, r1}.
  const int32x2_t offset_pair = vld1_s32(reference_offsets);
  const int32x2x2_t duplicated = vzip_s32(offset_pair, offset_pair);
  const int32x4_t numerator =
      vcombine_s32(duplicated.val[0], duplicated.val[1]);
  const int16x4_t denominator =
      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
  return MvProjection(mv, denominator, numerator);
}
55 
// Joins |mv0| (low half) and |mv1| (high half) into one vector and limits
// every lane to the range [-kProjectionMvClamp, kProjectionMvClamp].
inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
  const int16x8_t upper_bound = vdupq_n_s16(kProjectionMvClamp);
  const int16x8_t lower_bound = vnegq_s16(upper_bound);
  const int16x8_t capped = vminq_s16(vcombine_s16(mv0, mv1), upper_bound);
  return vmaxq_s16(capped, lower_bound);
}
62 
// Projects two consecutive temporal motion vectors against both entries of
// |reference_offsets| and clamps the results.
//
// The two MotionVectors are fetched as two 32-bit lanes (this assumes that a
// MotionVector occupies exactly 32 bits - confirm against its definition).
// Each one is broadcast across a 64-bit vector before being projected.
// Output lanes 0-3 derive from temporal_mvs[0] and lanes 4-7 from
// temporal_mvs[1].
inline int16x8_t MvProjectionCompoundClip(
    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
    const int reference_offsets[2]) {
  const int32x2_t mv_pair =
      vld1_s32(reinterpret_cast<const int32_t*>(temporal_mvs));
  const int16x4_t first_mv = vreinterpret_s16_s32(vdup_lane_s32(mv_pair, 0));
  const int16x4_t second_mv = vreinterpret_s16_s32(vdup_lane_s32(mv_pair, 1));
  const int16x4_t first_projection = MvProjectionCompound(
      first_mv, temporal_reference_offsets[0], reference_offsets);
  const int16x4_t second_projection = MvProjectionCompound(
      second_mv, temporal_reference_offsets[1], reference_offsets);
  return ProjectionClip(first_projection, second_projection);
}
77 
// Projects four consecutive temporal motion vectors with one shared
// |reference_offset| and clamps the results.
//
// Eight int16 values are loaded from |temporal_mvs| (this assumes that a
// MotionVector is a pair of int16 components - confirm against its
// definition). The divisor for each of the four vectors is read from
// kProjectionMvDivisionLookup using temporal_reference_offsets[0..3].
//
// |lookup| is caller-provided storage for the four divisors. All four lanes
// are rewritten here, so the value it holds on entry does not influence the
// result.
inline int16x8_t MvProjectionSingleClip(
    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
    const int reference_offset, int16x4_t* const lookup) {
  const int16x8_t mvs =
      vld1q_s16(reinterpret_cast<const int16_t*>(temporal_mvs));
  // Gather the four divisors lane by lane, then publish them to |lookup|.
  int16x4_t divisors = *lookup;
  divisors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], divisors,
      0);
  divisors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], divisors,
      1);
  divisors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], divisors,
      2);
  divisors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], divisors,
      3);
  *lookup = divisors;
  // Zipping the divisors with themselves yields {d0, d0, d1, d1} and
  // {d2, d2, d3, d3}, so both lanes of one vector share a divisor.
  const int16x4x2_t paired_divisors = vzip_s16(divisors, divisors);
  const int32x4_t numerator = vdupq_n_s32(reference_offset);
  const int16x4_t low_projection =
      MvProjection(vget_low_s16(mvs), paired_divisors.val[0], numerator);
  const int16x4_t high_projection =
      MvProjection(vget_high_s16(mvs), paired_divisors.val[1], numerator);
  return ProjectionClip(low_projection, high_projection);
}
100 
// Stores |mv| to |candidate_mvs| after dropping the lowest bit of each lane.
//
// Negative lanes are incremented by one before bit 0 is cleared; non-negative
// lanes only have bit 0 cleared. Eight int16 values are written.
inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
  const uint16x8_t raw_bits = vreinterpretq_u16_s16(mv);
  // A logical shift by 15 isolates the sign bit as 0 or 1; the accumulate
  // form adds it back onto the value.
  const int16x8_t sign_added =
      vreinterpretq_s16_u16(vsraq_n_u16(raw_bits, raw_bits, 15));
  const int16x8_t low_bit = vdupq_n_s16(1);
  vst1q_s16(static_cast<int16_t*>(candidate_mvs),
            vbicq_s16(sign_added, low_bit));
}
108 
// Stores |mv| to |candidate_mvs| after rounding each lane to a multiple of 8.
//
// Each lane gets its sign bit (0 or 1) added, then 3 is added, and finally
// the three lowest bits are cleared. Eight int16 values are written.
inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
  const uint16x8_t raw_bits = vreinterpretq_u16_s16(mv);
  // A logical shift by 15 isolates the sign bit as 0 or 1; the accumulate
  // form adds it back onto the value.
  const int16x8_t sign_added =
      vreinterpretq_s16_u16(vsraq_n_u16(raw_bits, raw_bits, 15));
  const int16x8_t biased = vaddq_s16(sign_added, vdupq_n_s16(3));
  const int16x8_t fraction_bits = vdupq_n_s16(7);
  vst1q_s16(static_cast<int16_t*>(candidate_mvs),
            vbicq_s16(biased, fraction_bits));
}
117 
// Compound projection whose results are stored through LowPrecision().
//
// Two temporal motion vectors are handled per iteration, so an odd |count|
// causes one element past |count| to be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionCompoundLowPrecision_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // The non-zero check on |reference_offsets| is almost always true, so it is
  // skipped. A local copy of |reference_offsets| helps the compiler.
  const int local_offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // Round up: one extra element may be computed.
  int iterations_left = (count + 1) >> 1;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionCompoundClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        local_offsets);
    LowPrecision(projected, candidate_mvs + index);
    index += 2;
  } while (--iterations_left != 0);
}
137 
// Compound projection whose results are stored through ForceInteger().
//
// Two temporal motion vectors are handled per iteration, so an odd |count|
// causes one element past |count| to be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionCompoundForceInteger_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // The non-zero check on |reference_offsets| is almost always true, so it is
  // skipped. A local copy of |reference_offsets| helps the compiler.
  const int local_offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // Round up: one extra element may be computed.
  int iterations_left = (count + 1) >> 1;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionCompoundClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        local_offsets);
    ForceInteger(projected, candidate_mvs + index);
    index += 2;
  } while (--iterations_left != 0);
}
157 
// Compound projection whose clamped results are stored without any further
// rounding.
//
// Two temporal motion vectors are handled per iteration, so an odd |count|
// causes one element past |count| to be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionCompoundHighPrecision_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // The non-zero check on |reference_offsets| is almost always true, so it is
  // skipped. A local copy of |reference_offsets| helps the compiler.
  const int local_offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // Round up: one extra element may be computed.
  int iterations_left = (count + 1) >> 1;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionCompoundClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        local_offsets);
    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs + index), projected);
    index += 2;
  } while (--iterations_left != 0);
}
177 
// Single-reference projection whose results are stored through
// LowPrecision().
//
// Four temporal motion vectors are handled per iteration, so up to three
// elements past |count| may be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionSingleLowPrecision_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offset, const int count,
    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // Divisor storage shared by all iterations; every lane is rewritten on each
  // call to MvProjectionSingleClip().
  int16x4_t divisor_lanes = vdup_n_s16(0);
  // Round up to a multiple of four.
  int iterations_left = (count + 3) >> 2;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionSingleClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        reference_offset, &divisor_lanes);
    LowPrecision(projected, candidate_mvs + index);
    index += 4;
  } while (--iterations_left != 0);
}
195 
// Single-reference projection whose results are stored through
// ForceInteger().
//
// Four temporal motion vectors are handled per iteration, so up to three
// elements past |count| may be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionSingleForceInteger_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offset, const int count,
    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // Divisor storage shared by all iterations; every lane is rewritten on each
  // call to MvProjectionSingleClip().
  int16x4_t divisor_lanes = vdup_n_s16(0);
  // Round up to a multiple of four.
  int iterations_left = (count + 3) >> 2;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionSingleClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        reference_offset, &divisor_lanes);
    ForceInteger(projected, candidate_mvs + index);
    index += 4;
  } while (--iterations_left != 0);
}
213 
// Single-reference projection whose clamped results are stored without any
// further rounding.
//
// Four temporal motion vectors are handled per iteration, so up to three
// elements past |count| may be read from the inputs and written to
// |candidate_mvs|. The loop body always executes at least once, so |count| is
// expected to be positive.
void MvProjectionSingleHighPrecision_NEON(
    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
    const int reference_offset, const int count,
    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
  // Divisor storage shared by all iterations; every lane is rewritten on each
  // call to MvProjectionSingleClip().
  int16x4_t divisor_lanes = vdup_n_s16(0);
  // Round up to a multiple of four.
  int iterations_left = (count + 3) >> 2;
  int index = 0;
  do {
    const int16x8_t projected = MvProjectionSingleClip(
        temporal_mvs + index, temporal_reference_offsets + index,
        reference_offset, &divisor_lanes);
    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs + index), projected);
    index += 4;
  } while (--iterations_left != 0);
}
231 
232 }  // namespace
233 
MotionVectorSearchInit_NEON()234 void MotionVectorSearchInit_NEON() {
235   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
236   assert(dsp != nullptr);
237   dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
238   dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
239   dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
240   dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
241   dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
242   dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
243 }
244 
245 }  // namespace dsp
246 }  // namespace libgav1
247 
248 #else   // !LIBGAV1_ENABLE_NEON
249 namespace libgav1 {
250 namespace dsp {
251 
// LIBGAV1_ENABLE_NEON is off for this build, so there is nothing to install.
void MotionVectorSearchInit_NEON() {
  // Intentionally empty.
}
253 
254 }  // namespace dsp
255 }  // namespace libgav1
256 #endif  // LIBGAV1_ENABLE_NEON
257