1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/motion_vector_search.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/utils/common.h"
29 #include "src/utils/constants.h"
30 #include "src/utils/types.h"
31
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35
// Computes, for each of the four lanes,
//   saturate16(RoundShift(mv * denominator * numerator, 14))
// where the rounding is symmetric for positive and negative products.
inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
                              const int32x4_t numerator) {
  // Widening 16x16 -> 32 bit multiply, followed by a 32-bit multiply with the
  // per-lane numerator.
  const int32x4_t product = vmulq_s32(vmull_s16(mv, denominator), numerator);
  // Accumulate |product| >> 31, i.e. add -1 to negative lanes and 0 to the
  // others, so that the rounding shift below treats both signs alike.
  const int32x4_t sign_adjusted = vsraq_n_s32(product, product, 31);
  // Rounding shift right by 14 with saturating narrow to 16 bits.
  return vqrshrn_n_s32(sign_adjusted, 14);
}
44
// Projects |mv| with the division factor looked up for
// |temporal_reference_offsets|. Lanes 0-1 are scaled by reference_offsets[0]
// and lanes 2-3 by reference_offsets[1].
inline int16x4_t MvProjectionCompound(const int16x4_t mv,
                                      const int temporal_reference_offsets,
                                      const int reference_offsets[2]) {
  // Interleaving the offset pair with itself yields {o0, o0} and {o1, o1}.
  const int32x2_t offset_pair = vld1_s32(reference_offsets);
  const int32x2x2_t interleaved = vzip_s32(offset_pair, offset_pair);
  const int32x4_t numerator =
      vcombine_s32(interleaved.val[0], interleaved.val[1]);
  // The same division factor applies to all four lanes.
  const int16x4_t denominator =
      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
  return MvProjection(mv, denominator, numerator);
}
55
ProjectionClip(const int16x4_t mv0,const int16x4_t mv1)56 inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
57 const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
58 const int16x8_t mv = vcombine_s16(mv0, mv1);
59 const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
60 return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
61 }
62
MvProjectionCompoundClip(const MotionVector * LIBGAV1_RESTRICT const temporal_mvs,const int8_t * LIBGAV1_RESTRICT const temporal_reference_offsets,const int reference_offsets[2])63 inline int16x8_t MvProjectionCompoundClip(
64 const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
65 const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
66 const int reference_offsets[2]) {
67 const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
68 const int32x2_t temporal_mv = vld1_s32(tmvs);
69 const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
70 const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
71 const int16x4_t mv0 = MvProjectionCompound(
72 tmv0, temporal_reference_offsets[0], reference_offsets);
73 const int16x4_t mv1 = MvProjectionCompound(
74 tmv1, temporal_reference_offsets[1], reference_offsets);
75 return ProjectionClip(mv0, mv1);
76 }
77
MvProjectionSingleClip(const MotionVector * LIBGAV1_RESTRICT const temporal_mvs,const int8_t * LIBGAV1_RESTRICT const temporal_reference_offsets,const int reference_offset,int16x4_t * const lookup)78 inline int16x8_t MvProjectionSingleClip(
79 const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
80 const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
81 const int reference_offset, int16x4_t* const lookup) {
82 const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
83 const int16x8_t temporal_mv = vld1q_s16(tmvs);
84 *lookup = vld1_lane_s16(
85 &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
86 *lookup = vld1_lane_s16(
87 &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
88 *lookup = vld1_lane_s16(
89 &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
90 *lookup = vld1_lane_s16(
91 &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
92 const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
93 const int16x4_t tmv0 = vget_low_s16(temporal_mv);
94 const int16x4_t tmv1 = vget_high_s16(temporal_mv);
95 const int32x4_t numerator = vdupq_n_s32(reference_offset);
96 const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
97 const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
98 return ProjectionClip(mv0, mv1);
99 }
100
LowPrecision(const int16x8_t mv,void * const candidate_mvs)101 inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
102 const int16x8_t kRoundDownMask = vdupq_n_s16(1);
103 const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
104 const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
105 const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
106 vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
107 }
108
ForceInteger(const int16x8_t mv,void * const candidate_mvs)109 inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
110 const int16x8_t kRoundDownMask = vdupq_n_s16(7);
111 const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
112 const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
113 const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
114 const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
115 vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
116 }
117
MvProjectionCompoundLowPrecision_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)118 void MvProjectionCompoundLowPrecision_NEON(
119 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
120 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
121 const int reference_offsets[2], const int count,
122 CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
123 // |reference_offsets| non-zero check usually equals true and is ignored.
124 // To facilitate the compilers, make a local copy of |reference_offsets|.
125 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
126 // One more element could be calculated.
127 int loop_count = (count + 1) >> 1;
128 do {
129 const int16x8_t mv = MvProjectionCompoundClip(
130 temporal_mvs, temporal_reference_offsets, offsets);
131 LowPrecision(mv, candidate_mvs);
132 temporal_mvs += 2;
133 temporal_reference_offsets += 2;
134 candidate_mvs += 2;
135 } while (--loop_count != 0);
136 }
137
MvProjectionCompoundForceInteger_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)138 void MvProjectionCompoundForceInteger_NEON(
139 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
140 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
141 const int reference_offsets[2], const int count,
142 CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
143 // |reference_offsets| non-zero check usually equals true and is ignored.
144 // To facilitate the compilers, make a local copy of |reference_offsets|.
145 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
146 // One more element could be calculated.
147 int loop_count = (count + 1) >> 1;
148 do {
149 const int16x8_t mv = MvProjectionCompoundClip(
150 temporal_mvs, temporal_reference_offsets, offsets);
151 ForceInteger(mv, candidate_mvs);
152 temporal_mvs += 2;
153 temporal_reference_offsets += 2;
154 candidate_mvs += 2;
155 } while (--loop_count != 0);
156 }
157
MvProjectionCompoundHighPrecision_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * LIBGAV1_RESTRICT candidate_mvs)158 void MvProjectionCompoundHighPrecision_NEON(
159 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
160 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
161 const int reference_offsets[2], const int count,
162 CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
163 // |reference_offsets| non-zero check usually equals true and is ignored.
164 // To facilitate the compilers, make a local copy of |reference_offsets|.
165 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
166 // One more element could be calculated.
167 int loop_count = (count + 1) >> 1;
168 do {
169 const int16x8_t mv = MvProjectionCompoundClip(
170 temporal_mvs, temporal_reference_offsets, offsets);
171 vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
172 temporal_mvs += 2;
173 temporal_reference_offsets += 2;
174 candidate_mvs += 2;
175 } while (--loop_count != 0);
176 }
177
MvProjectionSingleLowPrecision_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)178 void MvProjectionSingleLowPrecision_NEON(
179 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
180 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
181 const int reference_offset, const int count,
182 MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
183 // Up to three more elements could be calculated.
184 int loop_count = (count + 3) >> 2;
185 int16x4_t lookup = vdup_n_s16(0);
186 do {
187 const int16x8_t mv = MvProjectionSingleClip(
188 temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
189 LowPrecision(mv, candidate_mvs);
190 temporal_mvs += 4;
191 temporal_reference_offsets += 4;
192 candidate_mvs += 4;
193 } while (--loop_count != 0);
194 }
195
MvProjectionSingleForceInteger_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)196 void MvProjectionSingleForceInteger_NEON(
197 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
198 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
199 const int reference_offset, const int count,
200 MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
201 // Up to three more elements could be calculated.
202 int loop_count = (count + 3) >> 2;
203 int16x4_t lookup = vdup_n_s16(0);
204 do {
205 const int16x8_t mv = MvProjectionSingleClip(
206 temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
207 ForceInteger(mv, candidate_mvs);
208 temporal_mvs += 4;
209 temporal_reference_offsets += 4;
210 candidate_mvs += 4;
211 } while (--loop_count != 0);
212 }
213
MvProjectionSingleHighPrecision_NEON(const MotionVector * LIBGAV1_RESTRICT temporal_mvs,const int8_t * LIBGAV1_RESTRICT temporal_reference_offsets,const int reference_offset,const int count,MotionVector * LIBGAV1_RESTRICT candidate_mvs)214 void MvProjectionSingleHighPrecision_NEON(
215 const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
216 const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
217 const int reference_offset, const int count,
218 MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
219 // Up to three more elements could be calculated.
220 int loop_count = (count + 3) >> 2;
221 int16x4_t lookup = vdup_n_s16(0);
222 do {
223 const int16x8_t mv = MvProjectionSingleClip(
224 temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
225 vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
226 temporal_mvs += 4;
227 temporal_reference_offsets += 4;
228 candidate_mvs += 4;
229 } while (--loop_count != 0);
230 }
231
232 } // namespace
233
MotionVectorSearchInit_NEON()234 void MotionVectorSearchInit_NEON() {
235 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
236 assert(dsp != nullptr);
237 dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
238 dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
239 dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
240 dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
241 dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
242 dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
243 }
244
245 } // namespace dsp
246 } // namespace libgav1
247
248 #else // !LIBGAV1_ENABLE_NEON
249 namespace libgav1 {
250 namespace dsp {
251
// NEON support is compiled out in this configuration, so there is nothing to
// register.
void MotionVectorSearchInit_NEON() {
  // Intentionally empty.
}
253
254 } // namespace dsp
255 } // namespace libgav1
256 #endif // LIBGAV1_ENABLE_NEON
257