1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/motion_vector_search.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_SSE4_1
19
20 #include <smmintrin.h>
21
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/x86/common_sse4.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31 #include "src/utils/types.h"
32
33 namespace libgav1 {
34 namespace dsp {
35 namespace {
36
37 constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
38 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
39 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780,
40 744, 712, 682, 655, 630, 606, 585, 564, 546, 528};
41
MvProjection(const __m128i mv,const __m128i denominator,const __m128i numerator)42 inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
43 const __m128i numerator) {
44 const __m128i m0 = _mm_madd_epi16(mv, denominator);
45 const __m128i m = _mm_mullo_epi32(m0, numerator);
46 // Add the sign (0 or -1) to round towards zero.
47 const __m128i sign = _mm_srai_epi32(m, 31);
48 const __m128i add_sign = _mm_add_epi32(m, sign);
49 const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
50 return _mm_srai_epi32(sum, 14);
51 }
52
MvProjectionClip(const __m128i mvs[2],const __m128i denominators[2],const __m128i numerator)53 inline __m128i MvProjectionClip(const __m128i mvs[2],
54 const __m128i denominators[2],
55 const __m128i numerator) {
56 const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
57 const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
58 const __m128i mv = _mm_packs_epi32(s0, s1);
59 const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
60 const __m128i projection_mv_clamp_negative =
61 _mm_set1_epi16(-kProjectionMvClamp);
62 const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
63 return _mm_max_epi16(clamp, projection_mv_clamp_negative);
64 }
65
MvProjectionCompoundClip(const MotionVector * const temporal_mvs,const int8_t temporal_reference_offsets[2],const int reference_offsets[2])66 inline __m128i MvProjectionCompoundClip(
67 const MotionVector* const temporal_mvs,
68 const int8_t temporal_reference_offsets[2],
69 const int reference_offsets[2]) {
70 const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
71 const __m128i temporal_mv = LoadLo8(tmvs);
72 const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
73 __m128i mvs[2], denominators[2];
74 mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
75 mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
76 denominators[0] = _mm_set1_epi32(
77 kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
78 denominators[1] = _mm_set1_epi32(
79 kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
80 const __m128i offsets = LoadLo8(reference_offsets);
81 const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
82 return MvProjectionClip(mvs, denominators, numerator);
83 }
84
MvProjectionSingleClip(const MotionVector * const temporal_mvs,const int8_t * const temporal_reference_offsets,const int reference_offset)85 inline __m128i MvProjectionSingleClip(
86 const MotionVector* const temporal_mvs,
87 const int8_t* const temporal_reference_offsets,
88 const int reference_offset) {
89 const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
90 const __m128i temporal_mv = LoadAligned16(tmvs);
91 __m128i lookup = _mm_cvtsi32_si128(
92 kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
93 lookup = _mm_insert_epi32(
94 lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
95 1);
96 lookup = _mm_insert_epi32(
97 lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
98 2);
99 lookup = _mm_insert_epi32(
100 lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
101 3);
102 __m128i mvs[2], denominators[2];
103 mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
104 mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
105 denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
106 denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
107 const __m128i numerator = _mm_set1_epi32(reference_offset);
108 return MvProjectionClip(mvs, denominators, numerator);
109 }
110
LowPrecision(const __m128i mv,void * const candidate_mvs)111 inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
112 const __m128i kRoundDownMask = _mm_set1_epi16(~1);
113 const __m128i sign = _mm_srai_epi16(mv, 15);
114 const __m128i sub_sign = _mm_sub_epi16(mv, sign);
115 const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
116 StoreAligned16(candidate_mvs, d);
117 }
118
ForceInteger(const __m128i mv,void * const candidate_mvs)119 inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
120 const __m128i kRoundDownMask = _mm_set1_epi16(~7);
121 const __m128i sign = _mm_srai_epi16(mv, 15);
122 const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
123 const __m128i mv2 = _mm_sub_epi16(mv1, sign);
124 const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
125 StoreAligned16(candidate_mvs, mv3);
126 }
127
MvProjectionCompoundLowPrecision_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * candidate_mvs)128 void MvProjectionCompoundLowPrecision_SSE4_1(
129 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
130 const int reference_offsets[2], const int count,
131 CompoundMotionVector* candidate_mvs) {
132 // |reference_offsets| non-zero check usually equals true and is ignored.
133 // To facilitate the compilers, make a local copy of |reference_offsets|.
134 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
135 // One more element could be calculated.
136 int i = 0;
137 do {
138 const __m128i mv = MvProjectionCompoundClip(
139 temporal_mvs + i, temporal_reference_offsets + i, offsets);
140 LowPrecision(mv, candidate_mvs + i);
141 i += 2;
142 } while (i < count);
143 }
144
MvProjectionCompoundForceInteger_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * candidate_mvs)145 void MvProjectionCompoundForceInteger_SSE4_1(
146 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
147 const int reference_offsets[2], const int count,
148 CompoundMotionVector* candidate_mvs) {
149 // |reference_offsets| non-zero check usually equals true and is ignored.
150 // To facilitate the compilers, make a local copy of |reference_offsets|.
151 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
152 // One more element could be calculated.
153 int i = 0;
154 do {
155 const __m128i mv = MvProjectionCompoundClip(
156 temporal_mvs + i, temporal_reference_offsets + i, offsets);
157 ForceInteger(mv, candidate_mvs + i);
158 i += 2;
159 } while (i < count);
160 }
161
MvProjectionCompoundHighPrecision_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offsets[2],const int count,CompoundMotionVector * candidate_mvs)162 void MvProjectionCompoundHighPrecision_SSE4_1(
163 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
164 const int reference_offsets[2], const int count,
165 CompoundMotionVector* candidate_mvs) {
166 // |reference_offsets| non-zero check usually equals true and is ignored.
167 // To facilitate the compilers, make a local copy of |reference_offsets|.
168 const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
169 // One more element could be calculated.
170 int i = 0;
171 do {
172 const __m128i mv = MvProjectionCompoundClip(
173 temporal_mvs + i, temporal_reference_offsets + i, offsets);
174 StoreAligned16(candidate_mvs + i, mv);
175 i += 2;
176 } while (i < count);
177 }
178
MvProjectionSingleLowPrecision_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offset,const int count,MotionVector * candidate_mvs)179 void MvProjectionSingleLowPrecision_SSE4_1(
180 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
181 const int reference_offset, const int count, MotionVector* candidate_mvs) {
182 // Up to three more elements could be calculated.
183 int i = 0;
184 do {
185 const __m128i mv = MvProjectionSingleClip(
186 temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
187 LowPrecision(mv, candidate_mvs + i);
188 i += 4;
189 } while (i < count);
190 }
191
MvProjectionSingleForceInteger_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offset,const int count,MotionVector * candidate_mvs)192 void MvProjectionSingleForceInteger_SSE4_1(
193 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
194 const int reference_offset, const int count, MotionVector* candidate_mvs) {
195 // Up to three more elements could be calculated.
196 int i = 0;
197 do {
198 const __m128i mv = MvProjectionSingleClip(
199 temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
200 ForceInteger(mv, candidate_mvs + i);
201 i += 4;
202 } while (i < count);
203 }
204
MvProjectionSingleHighPrecision_SSE4_1(const MotionVector * temporal_mvs,const int8_t * temporal_reference_offsets,const int reference_offset,const int count,MotionVector * candidate_mvs)205 void MvProjectionSingleHighPrecision_SSE4_1(
206 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
207 const int reference_offset, const int count, MotionVector* candidate_mvs) {
208 // Up to three more elements could be calculated.
209 int i = 0;
210 do {
211 const __m128i mv = MvProjectionSingleClip(
212 temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
213 StoreAligned16(candidate_mvs + i, mv);
214 i += 4;
215 } while (i < count);
216 }
217
Init8bpp()218 void Init8bpp() {
219 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
220 assert(dsp != nullptr);
221 dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
222 dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
223 dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
224 dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
225 dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
226 dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
227 }
228
229 #if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp()230 void Init10bpp() {
231 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
232 assert(dsp != nullptr);
233 dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1;
234 dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1;
235 dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1;
236 dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1;
237 dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1;
238 dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1;
239 }
240 #endif
241
242 } // namespace
243
MotionVectorSearchInit_SSE4_1()244 void MotionVectorSearchInit_SSE4_1() {
245 Init8bpp();
246 #if LIBGAV1_MAX_BITDEPTH >= 10
247 Init10bpp();
248 #endif
249 }
250
251 } // namespace dsp
252 } // namespace libgav1
253
254 #else // !LIBGAV1_ENABLE_SSE4_1
255 namespace libgav1 {
256 namespace dsp {
257
// Fallback used when LIBGAV1_ENABLE_SSE4_1 is false: there are no SSE4.1
// functions to register, so the initializer does nothing. The symbol is
// presumably kept so that callers can invoke it unconditionally.
void MotionVectorSearchInit_SSE4_1() {
  // Intentionally empty.
}
259
260 } // namespace dsp
261 } // namespace libgav1
262 #endif // LIBGAV1_ENABLE_SSE4_1
263