1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/loop_restoration.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19 #include <smmintrin.h>
20
21 #include <algorithm>
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 #include <cstring>
26
27 #include "src/dsp/common.h"
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/utils/common.h"
32 #include "src/utils/constants.h"
33
34 namespace libgav1 {
35 namespace dsp {
36 namespace low_bitdepth {
37 namespace {
38
// Rounds, offset-corrects, and clips the summed horizontal filter products,
// then stores 8 lanes of 16-bit output to |wiener_buffer|.
// |s[0]| + |s[1]| is the filtered sum. |s_3x128| restores the 128 that was
// subtracted from the center tap (see WienerFilter_SSE4_1), pre-scaled by the
// horizontal rounding shift.
inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
                                 int16_t* const wiener_buffer) {
  constexpr int offset =
      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
  constexpr int limit =
      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
  const __m128i offsets = _mm_set1_epi16(-offset);
  const __m128i limits = _mm_set1_epi16(limit - offset);
  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
  const __m128i sum = _mm_add_epi16(s[0], s[1]);
  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
  // Add back scaled down offset correction.
  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
  // Clamp to [-offset, limit - offset].
  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
  const __m128i d1 = _mm_min_epi16(d0, limits);
  StoreAligned16(wiener_buffer, d1);
}
56
WienerHorizontalTap7Kernel(const __m128i s[4],const __m128i filter[4],int16_t * const wiener_buffer)57 inline void WienerHorizontalTap7Kernel(const __m128i s[4],
58 const __m128i filter[4],
59 int16_t* const wiener_buffer) {
60 __m128i madds[4];
61 madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
62 madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
63 madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
64 madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
65 madds[0] = _mm_add_epi16(madds[0], madds[2]);
66 madds[1] = _mm_add_epi16(madds[1], madds[3]);
67 const __m128i s_3x128 =
68 _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
69 WienerHorizontalClip(madds, s_3x128, wiener_buffer);
70 }
71
WienerHorizontalTap5Kernel(const __m128i s[5],const __m128i filter[3],int16_t * const wiener_buffer)72 inline void WienerHorizontalTap5Kernel(const __m128i s[5],
73 const __m128i filter[3],
74 int16_t* const wiener_buffer) {
75 __m128i madds[3];
76 madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
77 madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
78 madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
79 madds[0] = _mm_add_epi16(madds[0], madds[2]);
80 const __m128i s_3x128 =
81 _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
82 WienerHorizontalClip(madds, s_3x128, wiener_buffer);
83 }
84
WienerHorizontalTap3Kernel(const __m128i s[2],const __m128i filter[2],int16_t * const wiener_buffer)85 inline void WienerHorizontalTap3Kernel(const __m128i s[2],
86 const __m128i filter[2],
87 int16_t* const wiener_buffer) {
88 __m128i madds[2];
89 madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
90 madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
91 const __m128i s_3x128 =
92 _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
93 WienerHorizontalClip(madds, s_3x128, wiener_buffer);
94 }
95
96 // loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
// 7-tap horizontal Wiener filter. Processes 16 output columns per inner
// iteration by loading seven shifted copies of the source window and
// interleaving them so _mm_maddubs_epi16() forms pixel*tap pairs.
inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient0,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  // The rounding bias is interleaved with the last pixel so the final
  // multiply-add (times filter byte 1) also adds the rounding.
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[4];
  // Tap byte pairs matching the interleaving below:
  //   filter[0] = (c0, c1), filter[1] = (c2, c3),
  //   filter[2] = (c2, c1) for the symmetric back half, filter[3] = (c0, 1).
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[7], ss[4];
      // Seven shifted copies of the 16-pixel window.
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      s[5] = LoadUnaligned16(src + x + 5);
      s[6] = LoadUnaligned16(src + x + 6);
      // Low 8 output columns.
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
      ss[3] = _mm_unpacklo_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
      // High 8 output columns.
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
      ss[3] = _mm_unpackhi_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}
135
// 5-tap horizontal Wiener filter. Same structure as WienerHorizontalTap7()
// with a narrower window.
inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient1,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  // Rounding bias, added via the final multiply-add (times filter byte 1).
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[3];
  // Tap byte pairs matching the interleaving below:
  //   filter[0] = (c1, c2), filter[1] = (c3, c2), filter[2] = (c1, 1).
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[5], ss[3];
      // Five shifted copies of the 16-pixel window.
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      // Low 8 output columns.
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
      // High 8 output columns.
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}
169
// 3-tap horizontal Wiener filter. Same structure as WienerHorizontalTap7()
// with a narrower window.
inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient2,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  // Rounding bias, added via the final multiply-add (times filter byte 1).
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[2];
  // Tap byte pairs: filter[0] = (c2, c3), filter[1] = (c2, 1).
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[3], ss[2];
      // Three shifted copies of the 16-pixel window.
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      // Low 8 output columns.
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
      // High 8 output columns.
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}
198
WienerHorizontalTap1(const uint8_t * src,const ptrdiff_t src_stride,const ptrdiff_t width,const int height,int16_t ** const wiener_buffer)199 inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
200 const ptrdiff_t width, const int height,
201 int16_t** const wiener_buffer) {
202 for (int y = height; y != 0; --y) {
203 ptrdiff_t x = 0;
204 do {
205 const __m128i s = LoadUnaligned16(src + x);
206 const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
207 const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
208 const __m128i d0 = _mm_slli_epi16(s0, 4);
209 const __m128i d1 = _mm_slli_epi16(s1, 4);
210 StoreAligned16(*wiener_buffer + x + 0, d0);
211 StoreAligned16(*wiener_buffer + x + 8, d1);
212 x += 16;
213 } while (x < width);
214 src += src_stride;
215 *wiener_buffer += width;
216 }
217 }
218
WienerVertical7(const __m128i a[2],const __m128i filter[2])219 inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
220 const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
221 const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
222 const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
223 const __m128i sum0 = _mm_add_epi32(round, madd0);
224 const __m128i sum1 = _mm_add_epi32(sum0, madd1);
225 return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
226 }
227
WienerVertical5(const __m128i a[2],const __m128i filter[2])228 inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
229 const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
230 const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
231 const __m128i sum = _mm_add_epi32(madd0, madd1);
232 return _mm_srai_epi32(sum, kInterRoundBitsVertical);
233 }
234
WienerVertical3(const __m128i a,const __m128i filter)235 inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
236 const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
237 const __m128i madd = _mm_madd_epi16(a, filter);
238 const __m128i sum = _mm_add_epi32(round, madd);
239 return _mm_srai_epi32(sum, kInterRoundBitsVertical);
240 }
241
WienerVerticalFilter7(const __m128i a[7],const __m128i filter[2])242 inline __m128i WienerVerticalFilter7(const __m128i a[7],
243 const __m128i filter[2]) {
244 __m128i b[2];
245 const __m128i a06 = _mm_add_epi16(a[0], a[6]);
246 const __m128i a15 = _mm_add_epi16(a[1], a[5]);
247 const __m128i a24 = _mm_add_epi16(a[2], a[4]);
248 b[0] = _mm_unpacklo_epi16(a06, a15);
249 b[1] = _mm_unpacklo_epi16(a24, a[3]);
250 const __m128i sum0 = WienerVertical7(b, filter);
251 b[0] = _mm_unpackhi_epi16(a06, a15);
252 b[1] = _mm_unpackhi_epi16(a24, a[3]);
253 const __m128i sum1 = WienerVertical7(b, filter);
254 return _mm_packs_epi32(sum0, sum1);
255 }
256
WienerVerticalFilter5(const __m128i a[5],const __m128i filter[2])257 inline __m128i WienerVerticalFilter5(const __m128i a[5],
258 const __m128i filter[2]) {
259 const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
260 __m128i b[2];
261 const __m128i a04 = _mm_add_epi16(a[0], a[4]);
262 const __m128i a13 = _mm_add_epi16(a[1], a[3]);
263 b[0] = _mm_unpacklo_epi16(a04, a13);
264 b[1] = _mm_unpacklo_epi16(a[2], round);
265 const __m128i sum0 = WienerVertical5(b, filter);
266 b[0] = _mm_unpackhi_epi16(a04, a13);
267 b[1] = _mm_unpackhi_epi16(a[2], round);
268 const __m128i sum1 = WienerVertical5(b, filter);
269 return _mm_packs_epi32(sum0, sum1);
270 }
271
WienerVerticalFilter3(const __m128i a[3],const __m128i filter)272 inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
273 __m128i b;
274 const __m128i a02 = _mm_add_epi16(a[0], a[2]);
275 b = _mm_unpacklo_epi16(a02, a[1]);
276 const __m128i sum0 = WienerVertical3(b, filter);
277 b = _mm_unpackhi_epi16(a02, a[1]);
278 const __m128i sum1 = WienerVertical3(b, filter);
279 return _mm_packs_epi32(sum0, sum1);
280 }
281
WienerVerticalTap7Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter[2],__m128i a[7])282 inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
283 const ptrdiff_t wiener_stride,
284 const __m128i filter[2], __m128i a[7]) {
285 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
286 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
287 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
288 a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
289 a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
290 a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
291 a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
292 return WienerVerticalFilter7(a, filter);
293 }
294
WienerVerticalTap5Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter[2],__m128i a[5])295 inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
296 const ptrdiff_t wiener_stride,
297 const __m128i filter[2], __m128i a[5]) {
298 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
299 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
300 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
301 a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
302 a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
303 return WienerVerticalFilter5(a, filter);
304 }
305
WienerVerticalTap3Kernel(const int16_t * wiener_buffer,const ptrdiff_t wiener_stride,const __m128i filter,__m128i a[3])306 inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
307 const ptrdiff_t wiener_stride,
308 const __m128i filter, __m128i a[3]) {
309 a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
310 a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
311 a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
312 return WienerVerticalFilter3(a, filter);
313 }
314
// Filters two consecutive output rows. The first kernel call loads rows 0..6
// into |a|; only row 7 is then loaded and rows 1..7 (|a| + 1) are filtered
// for the second output row, so 6 of the 8 loads are shared.
inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[8];
  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
  a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
  d[1] = WienerVerticalFilter7(a + 1, filter);
}
323
// Filters two consecutive output rows. The first kernel call loads rows 0..4
// into |a|; only row 5 is then loaded and rows 1..5 (|a| + 1) are filtered
// for the second output row, so 4 of the 6 loads are shared.
inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[6];
  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  d[1] = WienerVerticalFilter5(a + 1, filter);
}
332
// Filters two consecutive output rows. The first kernel call loads rows 0..2
// into |a|; only row 3 is then loaded and rows 1..3 (|a| + 1) are filtered
// for the second output row, so 2 of the 4 loads are shared.
inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter, __m128i d[2]) {
  __m128i a[4];
  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  d[1] = WienerVerticalFilter3(a + 1, filter);
}
341
// 7-tap vertical Wiener filter over the horizontal-pass output in
// |wiener_buffer| (row pitch |width|). Two output rows are produced per
// outer iteration to share loaded input rows; results are packed to unsigned
// bytes into |dst|.
inline void WienerVerticalTap7(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[4], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = LoadLo8(coefficients);
  __m128i filter[2];
  // filter[0] broadcasts the (c0, c1) tap pair, filter[1] the (c2, c3) pair,
  // as 32-bit lanes for _mm_madd_epi16().
  filter[0] = _mm_shuffle_epi32(c, 0x0);
  filter[1] = _mm_shuffle_epi32(c, 0x55);
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];  // [column half][output row]
      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  // Odd |height|: filter the single remaining row.
  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[7];
      const __m128i d0 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}
377
// 5-tap vertical Wiener filter; same row-pairing structure as
// WienerVerticalTap7().
inline void WienerVerticalTap5(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[3], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = Load4(coefficients);
  __m128i filter[2];
  // filter[0] broadcasts the (c0, c1) tap pair; filter[1] pairs c2 with 1,
  // which multiplies the rounding bias interleaved by WienerVerticalFilter5().
  filter[0] = _mm_shuffle_epi32(c, 0);
  filter[1] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];  // [column half][output row]
      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  // Odd |height|: filter the single remaining row.
  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[5];
      const __m128i d0 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}
414
WienerVerticalTap3(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,const int16_t coefficients[2],uint8_t * dst,const ptrdiff_t dst_stride)415 inline void WienerVerticalTap3(const int16_t* wiener_buffer,
416 const ptrdiff_t width, const int height,
417 const int16_t coefficients[2], uint8_t* dst,
418 const ptrdiff_t dst_stride) {
419 const __m128i filter =
420 _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
421 for (int y = height >> 1; y > 0; --y) {
422 ptrdiff_t x = 0;
423 do {
424 __m128i d[2][2];
425 WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
426 WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
427 StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
428 StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
429 x += 16;
430 } while (x < width);
431 dst += 2 * dst_stride;
432 wiener_buffer += 2 * width;
433 }
434
435 if ((height & 1) != 0) {
436 ptrdiff_t x = 0;
437 do {
438 __m128i a[3];
439 const __m128i d0 =
440 WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
441 const __m128i d1 =
442 WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
443 StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
444 x += 16;
445 } while (x < width);
446 }
447 }
448
WienerVerticalTap1Kernel(const int16_t * const wiener_buffer,uint8_t * const dst)449 inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
450 uint8_t* const dst) {
451 const __m128i a0 = LoadAligned16(wiener_buffer + 0);
452 const __m128i a1 = LoadAligned16(wiener_buffer + 8);
453 const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
454 const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
455 const __m128i c0 = _mm_srai_epi16(b0, 4);
456 const __m128i c1 = _mm_srai_epi16(b1, 4);
457 const __m128i d = _mm_packus_epi16(c0, c1);
458 StoreAligned16(dst, d);
459 }
460
WienerVerticalTap1(const int16_t * wiener_buffer,const ptrdiff_t width,const int height,uint8_t * dst,const ptrdiff_t dst_stride)461 inline void WienerVerticalTap1(const int16_t* wiener_buffer,
462 const ptrdiff_t width, const int height,
463 uint8_t* dst, const ptrdiff_t dst_stride) {
464 for (int y = height >> 1; y > 0; --y) {
465 ptrdiff_t x = 0;
466 do {
467 WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
468 WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
469 x += 16;
470 } while (x < width);
471 dst += 2 * dst_stride;
472 wiener_buffer += 2 * width;
473 }
474
475 if ((height & 1) != 0) {
476 ptrdiff_t x = 0;
477 do {
478 WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
479 x += 16;
480 } while (x < width);
481 }
482 }
483
// Top-level 8-bit Wiener filter: a separable filter made of a horizontal pass
// into |restoration_buffer->wiener_buffer| followed by a vertical pass into
// |dest|. The tap count in each direction is selected by
// |number_leading_zero_coefficients|: 0 -> 7 taps, 1 -> 5, 2 -> 3, 3 -> 1.
void WienerFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int16_t* const number_leading_zero_coefficients =
      restoration_info.wiener_info.number_leading_zero_coefficients;
  // With fewer vertical taps, fewer extra rows are needed above the unit, so
  // the horizontal pass starts deeper into the buffer (at least one row in).
  const int number_rows_to_skip = std::max(
      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
      1);
  const ptrdiff_t wiener_stride = Align(width, 16);
  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
  // The values are saturated to 13 bits before storing.
  int16_t* wiener_buffer_horizontal =
      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;

  // horizontal filtering.
  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
  const int height_horizontal =
      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
  // Number of extra rows filtered from each of the top and bottom borders.
  const int height_extra = (height_horizontal - height) >> 1;
  assert(height_extra <= 2);
  const auto* const src = static_cast<const uint8_t*>(source);
  const auto* const top = static_cast<const uint8_t*>(top_border);
  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
  const int16_t* const filter_horizontal =
      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
  const __m128i c = LoadLo8(filter_horizontal);
  // In order to keep the horizontal pass intermediate values within 16 bits we
  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
  const __m128i coefficients_horizontal =
      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
  // Each branch filters |height_extra| rows of the top border, then the unit,
  // then |height_extra| rows of the bottom border, appending through
  // |wiener_buffer_horizontal|. The source pointer is backed up by half the
  // filter length (3, 2, 1, 0) so the window is centered.
  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[0],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[1],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
    // The maximum over-reads happen here.
    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[2],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
                         top_border_stride, wiener_stride, height_extra,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(src, stride, wiener_stride, height,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
                         height_extra, &wiener_buffer_horizontal);
  }

  // vertical filtering.
  // Over-writes up to 15 values.
  const int16_t* const filter_vertical =
      restoration_info.wiener_info.filter[WienerInfo::kVertical];
  auto* dst = static_cast<uint8_t*>(dest);
  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
    // Because the top row of |source| is a duplicate of the second row, and the
    // bottom row of |source| is a duplicate of its above row, we can duplicate
    // the top and bottom row of |wiener_buffer| accordingly.
    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
           sizeof(*wiener_buffer_horizontal) * wiener_stride);
    memcpy(restoration_buffer->wiener_buffer,
           restoration_buffer->wiener_buffer + wiener_stride,
           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
                       filter_vertical, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
                       height, filter_vertical + 1, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
                       wiener_stride, height, filter_vertical + 2, dst, stride);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
                       wiener_stride, height, dst, stride);
  }
}
593
594 //------------------------------------------------------------------------------
595 // SGR
596
597 // SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
598 // Pass 1 and 2 for Pass 2.
// 16 - 2 * padding, with padding 3 for Pass 1 and 2 for Pass 2 (the
// |width % 16| term varies per call and is handled by the Msan load helpers).
constexpr int kOverreadInBytesPass1 = 10;
constexpr int kOverreadInBytesPass2 = 12;
601
LoadAligned16x2U16(const uint16_t * const src[2],const ptrdiff_t x,__m128i dst[2])602 inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
603 __m128i dst[2]) {
604 dst[0] = LoadAligned16(src[0] + x);
605 dst[1] = LoadAligned16(src[1] + x);
606 }
607
LoadAligned16x2U16Msan(const uint16_t * const src[2],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2])608 inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
609 const ptrdiff_t x, const ptrdiff_t border,
610 __m128i dst[2]) {
611 dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
612 dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
613 }
614
LoadAligned16x3U16(const uint16_t * const src[3],const ptrdiff_t x,__m128i dst[3])615 inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
616 __m128i dst[3]) {
617 dst[0] = LoadAligned16(src[0] + x);
618 dst[1] = LoadAligned16(src[1] + x);
619 dst[2] = LoadAligned16(src[2] + x);
620 }
621
LoadAligned16x3U16Msan(const uint16_t * const src[3],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[3])622 inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
623 const ptrdiff_t x, const ptrdiff_t border,
624 __m128i dst[3]) {
625 dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
626 dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
627 dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
628 }
629
LoadAligned32U32(const uint32_t * const src,__m128i dst[2])630 inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
631 dst[0] = LoadAligned16(src + 0);
632 dst[1] = LoadAligned16(src + 4);
633 }
634
LoadAligned32U32Msan(const uint32_t * const src,const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2])635 inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
636 const ptrdiff_t border, __m128i dst[2]) {
637 dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
638 dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
639 }
640
LoadAligned32x2U32(const uint32_t * const src[2],const ptrdiff_t x,__m128i dst[2][2])641 inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
642 __m128i dst[2][2]) {
643 LoadAligned32U32(src[0] + x, dst[0]);
644 LoadAligned32U32(src[1] + x, dst[1]);
645 }
646
LoadAligned32x2U32Msan(const uint32_t * const src[2],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[2][2])647 inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
648 const ptrdiff_t x, const ptrdiff_t border,
649 __m128i dst[2][2]) {
650 LoadAligned32U32Msan(src[0], x, border, dst[0]);
651 LoadAligned32U32Msan(src[1], x, border, dst[1]);
652 }
653
LoadAligned32x3U32(const uint32_t * const src[3],const ptrdiff_t x,__m128i dst[3][2])654 inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
655 __m128i dst[3][2]) {
656 LoadAligned32U32(src[0] + x, dst[0]);
657 LoadAligned32U32(src[1] + x, dst[1]);
658 LoadAligned32U32(src[2] + x, dst[2]);
659 }
660
LoadAligned32x3U32Msan(const uint32_t * const src[3],const ptrdiff_t x,const ptrdiff_t border,__m128i dst[3][2])661 inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
662 const ptrdiff_t x, const ptrdiff_t border,
663 __m128i dst[3][2]) {
664 LoadAligned32U32Msan(src[0], x, border, dst[0]);
665 LoadAligned32U32Msan(src[1], x, border, dst[1]);
666 LoadAligned32U32Msan(src[2], x, border, dst[2]);
667 }
668
StoreAligned32U16(uint16_t * const dst,const __m128i src[2])669 inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
670 StoreAligned16(dst + 0, src[0]);
671 StoreAligned16(dst + 8, src[1]);
672 }
673
StoreAligned32U32(uint32_t * const dst,const __m128i src[2])674 inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
675 StoreAligned16(dst + 0, src[0]);
676 StoreAligned16(dst + 4, src[1]);
677 }
678
// Stores 16 uint32_t values (64 bytes) to the 16-byte-aligned buffer |dst|.
inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
  StoreAligned32U32(dst + 0, src + 0);
  StoreAligned32U32(dst + 8, src + 2);
}
683
684 // Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
685 // functions. Some compilers may generate super inefficient code and the whole
686 // decoder could be 15% slower.
687
// Zero-extends the low 8 bytes of |src0| and |src1| to 16 bits and returns
// their sum (analogous to NEON vaddl_u8 on the low halves). Uses unpack with
// zero rather than _mm_cvtepu8_epi16(); see the note above.
inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi8(src0, zero);
  const __m128i wide1 = _mm_unpacklo_epi8(src1, zero);
  return _mm_add_epi16(wide0, wide1);
}
693
// Zero-extends the high 8 bytes of |src0| and |src1| to 16 bits and returns
// their sum. Uses unpack with zero rather than _mm_cvtepu8_epi16(); see the
// note above.
inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpackhi_epi8(src0, zero);
  const __m128i wide1 = _mm_unpackhi_epi8(src1, zero);
  return _mm_add_epi16(wide0, wide1);
}
699
// Zero-extends the low 4 uint16 lanes of |src0| and |src1| to 32 bits and
// returns their sum.
inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi16(src0, zero);
  const __m128i wide1 = _mm_unpacklo_epi16(src1, zero);
  return _mm_add_epi32(wide0, wide1);
}
705
// Zero-extends the high 4 uint16 lanes of |src0| and |src1| to 32 bits and
// returns their sum.
inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpackhi_epi16(src0, zero);
  const __m128i wide1 = _mm_unpackhi_epi16(src1, zero);
  return _mm_add_epi32(wide0, wide1);
}
711
// Adds the low 8 bytes of |src1|, zero-extended to 16 bits, to the 16-bit
// accumulator |src0|.
inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
  const __m128i widened = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, widened);
}
716
// Adds the high 8 bytes of |src1|, zero-extended to 16 bits, to the 16-bit
// accumulator |src0|.
inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
  const __m128i widened = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, widened);
}
721
// Adds the low 4 uint16 lanes of |src1|, zero-extended to 32 bits, to the
// 32-bit accumulator |src0|.
inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
  const __m128i widened = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, widened);
}
726
// Adds the high 4 uint16 lanes of |src1|, zero-extended to 32 bits, to the
// 32-bit accumulator |src0|.
inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
  const __m128i widened = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, widened);
}
731
// Multiplies the low 4 uint16 lanes of |src0|, zero-extended to 32 bits, by
// the scalar |src1|. The zero high half of each widened lane makes
// _mm_madd_epi16() act as a plain 16x16->32 multiply by the low 16 bits of
// |src1|.
inline __m128i VmullNLo8(const __m128i src0, const int src1) {
  const __m128i wide = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(wide, _mm_set1_epi32(src1));
}
736
// Multiplies the high 4 uint16 lanes of |src0|, zero-extended to 32 bits, by
// the scalar |src1|.
inline __m128i VmullNHi8(const __m128i src0, const int src1) {
  const __m128i wide = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(wide, _mm_set1_epi32(src1));
}
741
// Multiplies the low 4 uint16 lanes of |src0| and |src1|, producing 32-bit
// products (analogous to NEON vmull_u16 on the low halves).
inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpacklo_epi16(src0, zero);
  const __m128i wide1 = _mm_unpacklo_epi16(src1, zero);
  return _mm_madd_epi16(wide0, wide1);
}
747
// Multiplies the high 4 uint16 lanes of |src0| and |src1|, producing 32-bit
// products.
inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i wide0 = _mm_unpackhi_epi16(src0, zero);
  const __m128i wide1 = _mm_unpackhi_epi16(src1, zero);
  return _mm_madd_epi16(wide0, wide1);
}
753
// Signed rounding right shift: (src0 + (1 << (src1 - 1))) >> src1 with an
// arithmetic shift. |src1| must be >= 1.
inline __m128i VrshrS32(const __m128i src0, const int src1) {
  const __m128i rounding = _mm_set1_epi32(1 << (src1 - 1));
  return _mm_srai_epi32(_mm_add_epi32(src0, rounding), src1);
}
758
// Unsigned rounding right shift: (src0 + (1 << (src1 - 1))) >> src1 with a
// logical shift. |src1| must be >= 1.
inline __m128i VrshrU32(const __m128i src0, const int src1) {
  const __m128i rounding = _mm_set1_epi32(1 << (src1 - 1));
  return _mm_srli_epi32(_mm_add_epi32(src0, rounding), src1);
}
763
// Squares the low 8 bytes of |src|, producing 16-bit results (255^2 = 65025
// fits in 16 bits). Uses unpack with zero rather than _mm_cvtepu8_epi16();
// see the note above.
inline __m128i SquareLo8(const __m128i src) {
  const __m128i pixels = _mm_unpacklo_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(pixels, pixels);
}
768
// Squares the high 8 bytes of |src|, producing 16-bit results.
inline __m128i SquareHi8(const __m128i src) {
  const __m128i pixels = _mm_unpackhi_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(pixels, pixels);
}
773
// Prepares the three byte-shifted copies of |src| (offsets 0, 1, 2) used as
// taps of a horizontal 3-tap sum. The shifts pull in zeros at the top, so
// callers only use the low-half results (see Sum3Horizontal()).
inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
}
779
// Prepares three byte-offset views (|offset|, |offset|+1, |offset|+2) of the
// 32-byte window formed by concatenating src[0] (low) and src[1] (high), so
// all 16 lanes of each tap are valid.
template <int offset>
inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
}
786
// 16-bit version of Prepare3_8: three element-offset views (0, 1, 2 uint16
// elements, i.e. 0, 2, 4 bytes) of the concatenated src[0]/src[1] window.
inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
  dst[0] = src[0];
  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
}
792
// Prepares the five byte-shifted copies of |src| (offsets 0..4) used as taps
// of a horizontal 5-tap sum. Only the low-half results are valid for all
// taps (see Sum5Horizontal()).
inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
  dst[3] = _mm_srli_si128(src, 3);
  dst[4] = _mm_srli_si128(src, 4);
}
800
// Prepares five byte-offset views (|offset| .. |offset|+4) of the 32-byte
// window formed by concatenating src[0] (low) and src[1] (high).
template <int offset>
inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
}
809
// 16-bit version of Prepare5_8: five element-offset views (0..4 uint16
// elements) of the concatenated src[0]/src[1] window. Reuses Prepare3_16()
// for the first three taps.
inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
  Prepare3_16(src, dst);
  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
}
815
// Adds three vectors of 16-bit lanes.
inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  return _mm_add_epi16(_mm_add_epi16(src0, src1), src2);
}

// Array form of the three-way 16-bit sum.
inline __m128i Sum3_16(const __m128i src[3]) {
  return Sum3_16(src[0], src[1], src[2]);
}
825
// Adds three vectors of 32-bit lanes.
inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  return _mm_add_epi32(_mm_add_epi32(src0, src1), src2);
}

// Three-way 32-bit sum over a pair of registers (8 lanes total).
inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
  for (int i = 0; i < 2; ++i) {
    dst[i] = Sum3_32(src[0][i], src[1][i], src[2][i]);
  }
}
836
// Zero-extends the low 8 bytes of each of the three inputs to 16 bits and
// sums them.
inline __m128i Sum3WLo16(const __m128i src[3]) {
  const __m128i sum = VaddlLo8(src[0], src[1]);
  return VaddwLo8(sum, src[2]);
}
841
// Zero-extends the high 8 bytes of each of the three inputs to 16 bits and
// sums them.
inline __m128i Sum3WHi16(const __m128i src[3]) {
  const __m128i sum = VaddlHi8(src[0], src[1]);
  return VaddwHi8(sum, src[2]);
}
846
// Zero-extends the low 4 uint16 lanes of each of the three inputs to 32 bits
// and sums them.
inline __m128i Sum3WLo32(const __m128i src[3]) {
  const __m128i sum = VaddlLo16(src[0], src[1]);
  return VaddwLo16(sum, src[2]);
}
851
// Zero-extends the high 4 uint16 lanes of each of the three inputs to 32 bits
// and sums them.
inline __m128i Sum3WHi32(const __m128i src[3]) {
  const __m128i sum = VaddlHi16(src[0], src[1]);
  return VaddwHi16(sum, src[2]);
}
856
// Adds five vectors of 16-bit lanes, pairing the first four adds to shorten
// the dependency chain.
inline __m128i Sum5_16(const __m128i src[5]) {
  const __m128i pair0 = _mm_add_epi16(src[0], src[1]);
  const __m128i pair1 = _mm_add_epi16(src[2], src[3]);
  return _mm_add_epi16(_mm_add_epi16(pair0, pair1), src[4]);
}
863
// Adds five vectors of 32-bit lanes (passed by pointer), pairing the first
// four adds to shorten the dependency chain.
inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
                       const __m128i* const src2, const __m128i* const src3,
                       const __m128i* const src4) {
  const __m128i pair0 = _mm_add_epi32(*src0, *src1);
  const __m128i pair1 = _mm_add_epi32(*src2, *src3);
  return _mm_add_epi32(_mm_add_epi32(pair0, pair1), *src4);
}

// Five-way 32-bit sum over a pair of registers (8 lanes total).
inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
  for (int i = 0; i < 2; ++i) {
    dst[i] =
        Sum5_32(&src[0][i], &src[1][i], &src[2][i], &src[3][i], &src[4][i]);
  }
}
877
// Zero-extends the low 8 bytes of each of the five inputs to 16 bits and sums
// them.
inline __m128i Sum5WLo16(const __m128i src[5]) {
  const __m128i sum01 = VaddlLo8(src[0], src[1]);
  const __m128i sum23 = VaddlLo8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwLo8(sum, src[4]);
}
884
// Zero-extends the high 8 bytes of each of the five inputs to 16 bits and
// sums them.
inline __m128i Sum5WHi16(const __m128i src[5]) {
  const __m128i sum01 = VaddlHi8(src[0], src[1]);
  const __m128i sum23 = VaddlHi8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwHi8(sum, src[4]);
}
891
// Horizontal 3-tap box sum of bytes: result lane i is src[i] + src[i+1] +
// src[i+2], widened to 16 bits. Only the low 8 results are valid (the taps
// shift in zeros at the top; see Prepare3Lo8()).
inline __m128i Sum3Horizontal(const __m128i src) {
  __m128i s[3];
  Prepare3Lo8(src, s);
  return Sum3WLo16(s);
}
897
// Horizontal 3-tap box sum of bytes over the 32-byte window src[0]/src[1],
// starting at byte |offset|. Produces 16 valid 16-bit sums in dst[0] (low 8)
// and dst[1] (high 8).
template <int offset>
inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_8<offset>(src, s);
  dst[0] = Sum3WLo16(s);
  dst[1] = Sum3WHi16(s);
}
905
// Horizontal 3-tap box sum of 16-bit values over the window src[0]/src[1],
// widened to 32 bits. Produces 8 sums in dst[0] (low 4) and dst[1] (high 4).
inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum3WLo32(s);
  dst[1] = Sum3WHi32(s);
}
912
// Horizontal 5-tap box sum of bytes, widened to 16 bits. Only the low 8
// results are valid (see Prepare5Lo8()).
inline __m128i Sum5Horizontal(const __m128i src) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  return Sum5WLo16(s);
}
918
// Horizontal 5-tap box sum of bytes over the 32-byte window src[0]/src[1],
// starting at byte |offset|. Produces 16 valid 16-bit sums in |*dst0| (low 8)
// and |*dst1| (high 8).
template <int offset>
inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
                           __m128i* const dst1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  *dst0 = Sum5WLo16(s);
  *dst1 = Sum5WHi16(s);
}
927
// Horizontal 5-tap box sum of 16-bit values over the window src[0]/src[1],
// widened to 32 bits. Produces 8 sums in dst[0] (low 4) and dst[1] (high 4).
// The adds are paired to shorten the dependency chain.
inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[5];
  Prepare5_16(src, s);
  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
  dst[0] = VaddwLo16(sum0123_lo, s[4]);
  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
  dst[1] = VaddwHi16(sum0123_hi, s[4]);
}
940
// Computes both the 3-tap sum (taps src[1..3]) and the 5-tap sum (all taps)
// of 16-bit values, widened to 32 bits, on the low halves. The 5-tap sum
// reuses the 3-tap sum: sum5 = sum3 + src[0] + src[4].
void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlLo16(src[0], src[4]);
  *row_sq3 = Sum3WLo32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}
947
// High-half counterpart of SumHorizontalLo(): 3-tap and 5-tap 32-bit sums of
// 16-bit values, with the 5-tap sum built from the 3-tap sum.
void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlHi16(src[0], src[4]);
  *row_sq3 = Sum3WHi32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}
954
// Byte version: computes the 3-tap (|*row3|) and 5-tap (|*row5|) horizontal
// box sums of |src|, widened to 16 bits. Only the low 8 results are valid
// (see Prepare5Lo8()). sum5 = sum3 + src[0] + src[4].
void SumHorizontalLo(const __m128i src, __m128i* const row3,
                     __m128i* const row5) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  const __m128i sum04 = VaddlLo8(s[0], s[4]);
  *row3 = Sum3WLo16(s + 1);
  *row5 = _mm_add_epi16(sum04, *row3);
}
963
// Byte version over the 32-byte window src[0]/src[1] starting at |offset|:
// computes 16 valid 3-tap sums (|*row3_0| low, |*row3_1| high) and 5-tap sums
// (|*row5_0|, |*row5_1|), all widened to 16 bits. The 5-tap sums reuse the
// 3-tap sums.
template <int offset>
void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
                   __m128i* const row3_1, __m128i* const row5_0,
                   __m128i* const row5_1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
  *row3_0 = Sum3WLo16(s + 1);
  *row3_1 = Sum3WHi16(s + 1);
  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
}
977
// 16-bit version over the window src[0]/src[1]: computes 8 3-tap and 8 5-tap
// horizontal sums widened to 32 bits, split into low/high halves.
inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
                          __m128i* const row_sq5_1) {
  __m128i s[5];
  Prepare5_16(src, s);
  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
}
986
// (3,4,3)-weighted sum of the low 8 bytes: 3*ma3[0] + 4*ma3[1] + 3*ma3[2],
// computed as 3 * (a + b + c) + b. Maximum 10 * 255 fits in 16 bits.
inline __m128i Sum343Lo(const __m128i ma3[3]) {
  const __m128i sum = Sum3WLo16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwLo8(sum3, ma3[1]);
}
992
// (3,4,3)-weighted sum of the high 8 bytes: 3 * (a + b + c) + b.
inline __m128i Sum343Hi(const __m128i ma3[3]) {
  const __m128i sum = Sum3WHi16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwHi8(sum3, ma3[1]);
}
998
// (3,4,3)-weighted sum of the low 4 uint16 lanes, widened to 32 bits:
// 3 * (a + b + c) + b.
inline __m128i Sum343WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwLo16(sum3, src[1]);
}
1004
// (3,4,3)-weighted sum of the high 4 uint16 lanes, widened to 32 bits.
inline __m128i Sum343WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwHi16(sum3, src[1]);
}
1010
// Horizontal (3,4,3)-weighted sum of 16-bit values over the window
// src[0]/src[1], producing 8 32-bit results.
inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum343WLo(s);
  dst[1] = Sum343WHi(s);
}
1017
// (5,6,5)-weighted sum of the low 8 bytes: 5*src[0] + 6*src[1] + 5*src[2],
// computed as 5 * (a + b + c) + b with the x5 built from a shift and add.
// Maximum 16 * 255 fits in 16 bits.
inline __m128i Sum565Lo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwLo8(sum5, src[1]);
}
1024
// (5,6,5)-weighted sum of the high 8 bytes: 5 * (a + b + c) + b.
inline __m128i Sum565Hi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwHi8(sum5, src[1]);
}
1031
// (5,6,5)-weighted sum of the low 4 uint16 lanes, widened to 32 bits:
// 5 * (a + b + c) + b.
inline __m128i Sum565WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwLo16(sum5, src[1]);
}
1038
// (5,6,5)-weighted sum of the high 4 uint16 lanes, widened to 32 bits.
inline __m128i Sum565WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwHi16(sum5, src[1]);
}
1045
// Horizontal (5,6,5)-weighted sum of 16-bit values over the window
// src[0]/src[1], producing 8 32-bit results.
inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum565WLo(s);
  dst[1] = Sum565WHi(s);
}
1052
// Computes two rows (y = 2) of horizontal box sums for both filter passes at
// once: 3-tap and 5-tap sums of the source pixels (|sum3|, |sum5|) and of
// their squares (|square_sum3|, |square_sum5|). 16 output positions are
// produced per inner iteration, pipelining the loads/squares so each 16-byte
// block is loaded and squared only once.
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
                   uint32_t* square_sum3, uint32_t* square_sum5) {
  int y = 2;
  do {
    // s[0]/s[1] hold the current and next 16 source bytes; sq[] holds their
    // squares (low/high/next-low). The Msan size argument marks how many of
    // the loaded bytes lie past the valid |width|.
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src,
                                 sum_width - x + kOverreadInBytesPass1 - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      // Pixel sums: 16 3-tap and 5-tap results from the 32-byte window.
      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
      StoreAligned32U16(sum3, row3);
      StoreAligned32U16(sum5, row5);
      // Squared-pixel sums: two 8-wide passes over the 16-bit squares.
      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 0, row_sq3);
      StoreAligned32U32(square_sum5 + 0, row_sq5);
      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 8, row_sq3);
      StoreAligned32U32(square_sum5 + 8, row_sq5);
      // Carry the trailing load/square into the next iteration.
      s[0] = s[1];
      sq[0] = sq[2];
      sum3 += 16;
      sum5 += 16;
      square_sum3 += 16;
      square_sum5 += 16;
    } while (x != 0);
    // Advance to the next row of the source and of each sum buffer.
    src += src_stride - sum_width;
    sum3 += sum_stride - sum_width;
    sum5 += sum_stride - sum_width;
    square_sum3 += sum_stride - sum_width;
    square_sum5 += sum_stride - sum_width;
  } while (--y != 0);
}
1094
// Single-pass variant of BoxSum(): computes two rows of horizontal box sums
// of the source pixels (|sums|) and their squares (|square_sums|) for one
// box size (3 or 5), 16 output positions per inner iteration.
template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sums,
                   uint32_t* square_sums) {
  static_assert(size == 3 || size == 5, "");
  // The allowed over-read differs per pass because the box radii differ.
  constexpr int kOverreadInBytes =
      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
  int y = 2;
  do {
    // Pipelined loads/squares, as in the combined BoxSum() above.
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row[2], row_sq[4];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      if (size == 3) {
        Sum3Horizontal<0>(s, row);
        Sum3WHorizontal(sq + 0, row_sq + 0);
        Sum3WHorizontal(sq + 1, row_sq + 2);
      } else {
        Sum5Horizontal<0>(s, &row[0], &row[1]);
        Sum5WHorizontal(sq + 0, row_sq + 0);
        Sum5WHorizontal(sq + 1, row_sq + 2);
      }
      StoreAligned32U16(sums, row);
      StoreAligned64U32(square_sums, row_sq);
      s[0] = s[1];
      sq[0] = sq[2];
      sums += 16;
      square_sums += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sums += sum_stride - sum_width;
    square_sums += sum_stride - sum_width;
  } while (--y != 0);
}
1137
// Computes the "ma" lookup index for the self-guided filter:
// max(0, n * sum_sq - sum^2) * scale, rounding-right-shifted by
// kSgrProjScaleBits. |n| is the box area (9 or 25). |sum| holds 16-bit lanes
// zero-interleaved so _mm_madd_epi16() yields sum^2 per 32-bit lane.
template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  // a = |sum_sq|
  // d = |sum|
  // p = (a * n < d * d) ? 0 : a * n - d * d;
  const __m128i dxd = _mm_madd_epi16(sum, sum);
  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
  // Some compilers could do this for us but we make this explicit.
  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
  // n == 9:  x * 9 = x + x * 8.  n == 25: x * 25 = x + x * 8 + x * 16.
  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
  const __m128i sub = _mm_sub_epi32(axn, dxd);
  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
  return VrshrU32(pxs, kSgrProjScaleBits);
}
1156
// 8-lane version: zero-extends the 16-bit |sum| lanes into two 4-lane halves,
// computes CalculateMa() on each, and packs the results back to 8 x 16-bit
// with unsigned saturation.
template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
  return _mm_packus_epi32(z0, z1);
}
1167
// Computes b = ma * sum * one_over_n for the 5x5 pass (n = 25), rounded to
// 16 bits. To keep the first multiply within 16 bits, _mm_maddubs_epi16()
// multiplies |ma| by one_over_n / 4 (= 41) and the final rounding shift is
// reduced by 2 to compensate.
inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
  // one_over_n == 164.
  constexpr uint32_t one_over_n =
      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
  // one_over_n_quarter == 41.
  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
  static_assert(one_over_n == one_over_n_quarter << 2, "");
  // |ma| is in range [0, 255].
  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
  const __m128i m0 = VmullLo16(m, sum);
  const __m128i m1 = VmullHi16(m, sum);
  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
  return _mm_packus_epi32(b_lo, b_hi);
}
1183
// Computes b = ma * sum * one_over_n for the 3x3 pass (n = 9), rounded to
// 16 bits and packed with unsigned saturation.
inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
  // one_over_n == 455.
  constexpr uint32_t one_over_n =
      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
  const __m128i m0 = VmullLo16(ma, sum);
  const __m128i m1 = VmullHi16(ma, sum);
  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
  return _mm_packus_epi32(b_lo, b_hi);
}
1196
// Computes the 5-tap box sum (|*sum|) and the corresponding ma lookup index
// (|*index|) from the pixel sums |s5| and squared-pixel sums |sq5|.
inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
                                  const uint32_t scale, __m128i* const sum,
                                  __m128i* const index) {
  __m128i sum_sq[2];
  *sum = Sum5_16(s5);
  Sum5_32(sq5, sum_sq);
  *index = CalculateMa<25>(*sum, sum_sq, scale);
}
1205
// Computes the 3-tap box sum (|*sum|) and the corresponding ma lookup index
// (|*index|) from the pixel sums |s3| and squared-pixel sums |sq3|.
inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
                                  const uint32_t scale, __m128i* const sum,
                                  __m128i* const index) {
  __m128i sum_sq[2];
  *sum = Sum3_16(s3);
  Sum3_32(sq3, sum_sq);
  *index = CalculateMa<9>(*sum, sum_sq, scale);
}
1214
// Maps 8 ma indices through the kSgrMaLookup table with scalar inserts into
// byte lanes [offset, offset + 7] of |*ma|, then computes the matching |*b|
// from |sum| and the looked-up ma values.
template <int n, int offset>
inline void LookupIntermediate(const __m128i sum, const __m128i index,
                               __m128i* const ma, __m128i* const b) {
  static_assert(n == 9 || n == 25, "");
  static_assert(offset == 0 || offset == 8, "");
  const __m128i idx = _mm_packus_epi16(index, index);
  // Actually it's not stored and loaded. The compiler will use a 64-bit
  // general-purpose register to process. Faster than using _mm_extract_epi8().
  uint8_t temp[8];
  StoreLo8(temp, idx);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
  // b = ma * b * one_over_n
  // |ma| = [0, 255]
  // |sum| is a box sum with radius 1 or 2.
  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
  // When radius is 2 |n| is 25. |one_over_n| is 164.
  // When radius is 1 |n| is 9. |one_over_n| is 455.
  // |kSgrProjReciprocalBits| is 12.
  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
  __m128i maq;
  if (offset == 0) {
    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
  } else {
    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
  }
  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
}
1252
// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
// to get value 0 as the shuffle result. The most significant bit 1 comes
// either from the comparison instruction, or from the sign bit of the index.
// Byte-shuffles |table| by |index|, returning 0 for any index outside
// [0, 15]: ORing the greater-than-15 comparison mask into the control byte
// sets its sign bit, which makes _mm_shuffle_epi8() emit 0 for that lane.
inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
  __m128i mask;
  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
  mask = _mm_or_si128(mask, index);
  return _mm_shuffle_epi8(table, mask);
}
1262
// Subtracts 1 from each byte of |value| whose corresponding |index| byte
// exceeds |threshold| - 128 (|index| has already been biased by -128 so that
// signed comparison works): the comparison mask is 0xff (-1) where true, and
// adding it performs the decrement.
inline __m128i AdjustValue(const __m128i value, const __m128i index,
                           const int threshold) {
  const __m128i limit = _mm_set1_epi8(threshold - 128);
  const __m128i decrement = _mm_cmpgt_epi8(index, limit);
  return _mm_add_epi8(value, decrement);
}
1269
// Computes 16 ma values (vectorized kSgrMaLookup lookup on the 16 indices in
// |index|) and the two corresponding b vectors for the 3x3 pass.
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
                                  __m128i* const ma, __m128i* const b0,
                                  __m128i* const b1) {
  // Use table lookup to read elements whose indices are less than 48.
  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
  __m128i idx;
  // Clip idx to 127 to apply signed comparison instructions.
  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
  // All elements whose indices are less than 48 are set to 0.
  // Get shuffle results for indices in range [0, 15].
  *ma = ShuffleIndex(c0, idx);
  // Get shuffle results for indices in range [16, 31].
  // Subtract 16 to utilize the sign bit of the index.
  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
  const __m128i res1 = ShuffleIndex(c1, idx);
  // Use OR instruction to combine shuffle results together.
  *ma = _mm_or_si128(*ma, res1);
  // Get shuffle results for indices in range [32, 47].
  // Subtract 16 to utilize the sign bit of the index.
  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
  const __m128i res2 = ShuffleIndex(c2, idx);
  *ma = _mm_or_si128(*ma, res2);

  // For elements whose indices are larger than 47, since they seldom change
  // values with the increase of the index, we use comparison and arithmetic
  // operations to calculate their values.
  // Add -128 to apply signed comparison instructions.
  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
  // Elements whose indices are larger than 47 (with value 0) are set to 5.
  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
  *ma = AdjustValue(*ma, idx, 55);  // 55 is the last index whose value is 5.
  *ma = AdjustValue(*ma, idx, 72);  // 72 is the last index whose value is 4.
  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.

  // b = ma * b * one_over_n
  // |ma| = [0, 255]
  // |sum| is a box sum with radius 1 or 2.
  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
  // When radius is 2 |n| is 25. |one_over_n| is 164.
  // When radius is 1 |n| is 9. |one_over_n| is 455.
  // |kSgrProjReciprocalBits| is 12.
  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
  *b0 = CalculateB3(sum[0], maq0);
  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
  *b1 = CalculateB3(sum[1], maq1);
}
1325
// Wrapper that repacks the 16 ma results: the low 8 bytes of the new values
// become the high half of ma[0] (its low half is kept), and the high 8 bytes
// become the low half of ma[1].
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
                                  __m128i ma[2], __m128i b[2]) {
  __m128i mas;
  CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
  ma[1] = _mm_srli_si128(mas, 8);
}
1333
// Note: Calling CalculateIntermediate() in place of the slow
// LookupIntermediate() when calculating 16 intermediate data points has been
// tried, but the compiler generates even slower code.
// Computes 8 ma values (written to byte lanes [offset, offset + 7] of |*ma|)
// and the corresponding |*b| for the 5x5 pass.
template <int offset>
inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
                                   const uint32_t scale, __m128i* const ma,
                                   __m128i* const b) {
  static_assert(offset == 0 || offset == 8, "");
  __m128i sum, index;
  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
  LookupIntermediate<25, offset>(sum, index, ma, b);
}
1346
// Computes 8 ma values (low byte lanes of |*ma|) and the corresponding |*b|
// for the 3x3 pass.
inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
                                   const uint32_t scale, __m128i* const ma,
                                   __m128i* const b) {
  __m128i sum, index;
  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
  LookupIntermediate<9, 0>(sum, index, ma, b);
}
1354
// Computes and stores the (4,4,4) and (3,4,3) weighted sums of the b values:
// sum_b444 = 4 * (3-tap sum), sum_b343 = sum_b444 - (3-tap sum) + middle
// (i.e. 3*b0 + 4*b1 + 3*b2), written at column |x| of |b444| and |b343|.
inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
                         __m128i sum_b343[2], __m128i sum_b444[2],
                         uint32_t* const b343, uint32_t* const b444) {
  __m128i b[3], sum_b111[2];
  Prepare3_16(b3, b);
  sum_b111[0] = Sum3WLo32(b);
  sum_b111[1] = Sum3WHi32(b);
  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
  StoreAligned32U32(b444 + x, sum_b444);
  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
  StoreAligned32U32(b343 + x, sum_b343);
}
1371
// Low-half version: computes and stores the (4,4,4) and (3,4,3) weighted sums
// of the ma values (low 8 bytes of the |ma3| taps) at column |x|, also
// returning them in |*sum_ma444| / |*sum_ma343|, and delegates the b sums to
// Store343_444().
inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
                           const ptrdiff_t x, __m128i* const sum_ma343,
                           __m128i* const sum_ma444, __m128i sum_b343[2],
                           __m128i sum_b444[2], uint16_t* const ma343,
                           uint16_t* const ma444, uint32_t* const b343,
                           uint32_t* const b444) {
  const __m128i sum_ma111 = Sum3WLo16(ma3);
  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
  StoreAligned16(ma444 + x, *sum_ma444);
  // (3,4,3) = 4 * (1,1,1) - (1,1,1) + middle.
  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
  StoreAligned16(ma343 + x, *sum_ma343);
  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
}
1386
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i * const sum_ma444,__m128i sum_b343[2],__m128i sum_b444[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1387 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1388 const ptrdiff_t x, __m128i* const sum_ma343,
1389 __m128i* const sum_ma444, __m128i sum_b343[2],
1390 __m128i sum_b444[2], uint16_t* const ma343,
1391 uint16_t* const ma444, uint32_t* const b343,
1392 uint32_t* const b444) {
1393 const __m128i sum_ma111 = Sum3WHi16(ma3);
1394 *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1395 StoreAligned16(ma444 + x, *sum_ma444);
1396 const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1397 *sum_ma343 = VaddwHi8(sum333, ma3[1]);
1398 StoreAligned16(ma343 + x, *sum_ma343);
1399 Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1400 }
1401
Store343_444Lo(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i sum_b343[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1402 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1403 const ptrdiff_t x, __m128i* const sum_ma343,
1404 __m128i sum_b343[2], uint16_t* const ma343,
1405 uint16_t* const ma444, uint32_t* const b343,
1406 uint32_t* const b444) {
1407 __m128i sum_ma444, sum_b444[2];
1408 Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1409 ma444, b343, b444);
1410 }
1411
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i sum_b343[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1412 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1413 const ptrdiff_t x, __m128i* const sum_ma343,
1414 __m128i sum_b343[2], uint16_t* const ma343,
1415 uint16_t* const ma444, uint32_t* const b343,
1416 uint32_t* const b444) {
1417 __m128i sum_ma444, sum_b444[2];
1418 Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1419 ma444, b343, b444);
1420 }
1421
Store343_444Lo(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1422 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1423 const ptrdiff_t x, uint16_t* const ma343,
1424 uint16_t* const ma444, uint32_t* const b343,
1425 uint32_t* const b444) {
1426 __m128i sum_ma343, sum_b343[2];
1427 Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1428 }
1429
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1430 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1431 const ptrdiff_t x, uint16_t* const ma343,
1432 uint16_t* const ma444, uint32_t* const b343,
1433 uint32_t* const b444) {
1434 __m128i sum_ma343, sum_b343[2];
1435 Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1436 }
1437
// 5x5 box filter preprocessing for the leftmost 16 pixels (x == 0) of a row
// pair.  Computes the horizontal (squared) sums of the two incoming rows,
// stores them into rows [3] and [4] of the sum5/square_sum5 ring buffers,
// then loads three rows back from the buffers and derives the intermediates
// |ma| and |b|.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
    const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
    __m128i* const b) {
  __m128i s5[2][5], sq5[5][2];
  // Square the high halves; the callers computed sq[][0] (the low halves)
  // before calling.
  sq[0][1] = SquareHi8(s[0][0]);
  sq[1][1] = SquareHi8(s[1][0]);
  s5[0][3] = Sum5Horizontal(s[0][0]);
  StoreAligned16(sum5[3], s5[0][3]);
  s5[0][4] = Sum5Horizontal(s[1][0]);
  StoreAligned16(sum5[4], s5[0][4]);
  Sum5WHorizontal(sq[0], sq5[3]);
  StoreAligned32U32(square_sum5[3], sq5[3]);
  Sum5WHorizontal(sq[1], sq5[4]);
  StoreAligned32U32(square_sum5[4], sq5[4]);
  // Rows [0..2] come from previously stored data in the ring buffers.
  LoadAligned16x3U16(sum5, 0, s5[0]);
  LoadAligned32x3U32(square_sum5, 0, sq5);
  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
}
1457
// 5x5 box filter preprocessing for the next 16 pixels at offset |x|.
// Processes the two 8-pixel halves of s[][1] in turn: stores their horizontal
// (squared) sums into rows [3]/[4] of the ring buffers at x and x + 8, and
// produces one |ma|/|b| intermediate pair per half.  Loads past |sum_width|
// use the Msan variants to keep memory sanitizers quiet on overread lanes.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
    const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
    const uint32_t scale, uint16_t* const sum5[5],
    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
    __m128i b[3]) {
  __m128i s5[2][5], sq5[5][2];
  // First (low) half.
  sq[0][2] = SquareLo8(s[0][1]);
  sq[1][2] = SquareLo8(s[1][1]);
  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
  Sum5WHorizontal(sq[0] + 1, sq5[3]);
  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
  Sum5WHorizontal(sq[1] + 1, sq5[4]);
  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
  LoadAligned16x3U16(sum5, x, s5[0]);
  LoadAligned32x3U32(square_sum5, x, sq5);
  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);

  // Second (high) half, at offset x + 8.
  sq[0][3] = SquareHi8(s[0][1]);
  sq[1][3] = SquareHi8(s[1][1]);
  Sum5WHorizontal(sq[0] + 2, sq5[3]);
  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
  Sum5WHorizontal(sq[1] + 2, sq5[4]);
  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
}
1490
// Like BoxFilterPreProcess5Lo() but for the last row of the frame: only one
// new source row exists, so row [4] of the window reuses row [3]'s sums, and
// nothing is written back to the (now const) ring buffers.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
    __m128i* const b) {
  __m128i s5[5], sq5[5][2];
  sq[1] = SquareHi8(s);
  // Duplicate the bottom row: rows [3] and [4] are identical here.
  s5[3] = s5[4] = Sum5Horizontal(s);
  Sum5WHorizontal(sq, sq5[3]);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  LoadAligned16x3U16(sum5, 0, s5);
  LoadAligned32x3U32(square_sum5, 0, sq5);
  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
1505
// Like BoxFilterPreProcess5() but for the last row of the frame: row [4]
// duplicates row [3] in both halves, and the ring buffers are read-only.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
    const uint32_t scale, const uint16_t* const sum5[5],
    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
    __m128i b[3]) {
  __m128i s5[2][5], sq5[5][2];
  // First (low) half.
  sq[2] = SquareLo8(s[1]);
  Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
  s5[0][4] = s5[0][3];
  s5[1][4] = s5[1][3];
  Sum5WHorizontal(sq + 1, sq5[3]);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  LoadAligned16x3U16(sum5, x, s5[0]);
  LoadAligned32x3U32(square_sum5, x, sq5);
  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);

  // Second (high) half, at offset x + 8.
  sq[3] = SquareHi8(s[1]);
  Sum5WHorizontal(sq + 2, sq5[3]);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
}
1531
// 3x3 box filter preprocessing for the leftmost 16 pixels (x == 0): stores
// the horizontal (squared) sums of the new row into row [2] of the
// sum3/square_sum3 ring buffers, loads two rows back and derives |ma| and
// |b|.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
    __m128i* const b) {
  __m128i s3[3], sq3[3][2];
  // sq[0] (the low half) was computed by the caller.
  sq[1] = SquareHi8(s);
  s3[2] = Sum3Horizontal(s);
  StoreAligned16(sum3[2], s3[2]);
  Sum3WHorizontal(sq, sq3[2]);
  StoreAligned32U32(square_sum3[2], sq3[2]);
  // Rows [0..1] come from previously stored data in the ring buffers.
  LoadAligned16x2U16(sum3, 0, s3);
  LoadAligned32x2U32(square_sum3, 0, sq3);
  CalculateIntermediate3(s3, sq3, scale, ma, b);
}
1546
// 3x3 box filter preprocessing for the next 16 pixels at offset |x|: stores
// the horizontal (squared) sums of the new row into row [2] of the ring
// buffers, then computes sum/index pairs for both 8-pixel halves and resolves
// them into |ma| and |b| with a single CalculateIntermediate() call.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
    const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
    const uint32_t scale, uint16_t* const sum3[3],
    uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
    __m128i b[3]) {
  __m128i s3[4], sq3[3][2], sum[2], index[2];
  // First (low) half.
  sq[2] = SquareLo8(s[1]);
  Sum3Horizontal<8>(s, s3 + 2);
  StoreAligned32U16(sum3[2] + x, s3 + 2);
  Sum3WHorizontal(sq + 1, sq3[2]);
  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
  LoadAligned16x2U16(sum3, x, s3);
  LoadAligned32x2U32(square_sum3, x, sq3);
  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);

  // Second (high) half, at offset x + 8 (Msan variants tolerate overread
  // lanes past |sum_width|).
  sq[3] = SquareHi8(s[1]);
  Sum3WHorizontal(sq + 2, sq3[2]);
  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
  CalculateIntermediate(sum, index, ma, b + 1);
}
1570
// Combined 3x3 + 5x5 preprocessing for the leftmost 16 pixels (x == 0) when
// both filter passes run.  SumHorizontalLo/SumHorizontal produce the 3- and
// 5-tap sums in one pass over the source; results go into rows [2]/[3] of
// sum3 and rows [3]/[4] of sum5, and intermediates for both passes are
// derived (scales[1] for the 3x3 pass, scales[0] for the 5x5 pass).
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
    const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
    __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
  // Square the high halves; the callers computed sq[][0] before calling.
  sq[0][1] = SquareHi8(s[0][0]);
  sq[1][1] = SquareHi8(s[1][0]);
  SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
  SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
  StoreAligned16(sum3[2], s3[2]);
  StoreAligned16(sum3[3], s3[3]);
  StoreAligned16(sum5[3], s5[3]);
  StoreAligned16(sum5[4], s5[4]);
  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  StoreAligned32U32(square_sum3[2], sq3[2]);
  StoreAligned32U32(square_sum5[3], sq5[3]);
  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
  StoreAligned32U32(square_sum3[3], sq3[3]);
  StoreAligned32U32(square_sum5[4], sq5[4]);
  LoadAligned16x2U16(sum3, 0, s3);
  LoadAligned32x2U32(square_sum3, 0, sq3);
  LoadAligned16x3U16(sum5, 0, s5);
  LoadAligned32x3U32(square_sum5, 0, sq5);
  // 3x3 intermediates for both rows of the pair.
  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
  CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
  // The second row's ma values live in the upper 8 bytes of ma3[0][0].
  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
}
1601
// Combined 3x3 + 5x5 preprocessing for the next 16 pixels at offset |x| when
// both filter passes run.  As in BoxFilterPreProcessLo(), the 3- and 5-tap
// horizontal sums are computed in one pass; each 8-pixel half then yields a
// set of 3x3 intermediates per row (ma3/b3) and one 5x5 intermediate pair
// (ma5/b5).
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
    const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
    uint16_t* const sum3[4], uint16_t* const sum5[5],
    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
    const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
    __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
  __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
  // Horizontal sums for both halves of both rows, written to the ring
  // buffers at x and x + 8.
  SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
  StoreAligned16(sum3[2] + x + 0, s3[0][2]);
  StoreAligned16(sum3[2] + x + 8, s3[1][2]);
  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
  SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
  StoreAligned16(sum3[3] + x + 0, s3[0][3]);
  StoreAligned16(sum3[3] + x + 8, s3[1][3]);
  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
  // First (low) half.
  sq[0][2] = SquareLo8(s[0][1]);
  sq[1][2] = SquareLo8(s[1][1]);
  SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
  SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
  LoadAligned16x2U16(sum3, x, s3[0]);
  LoadAligned32x2U32(square_sum3, x, sq3);
  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
                        &index[1][0]);
  LoadAligned16x3U16(sum5, x, s5[0]);
  LoadAligned32x3U32(square_sum5, x, sq5);
  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);

  // Second (high) half, at offset x + 8 (Msan variants tolerate overread
  // lanes past |sum_width|).
  sq[0][3] = SquareHi8(s[0][1]);
  sq[1][3] = SquareHi8(s[1][1]);
  SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
  SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
                        &index[1][1]);
  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
}
1655
// Combined 3x3 + 5x5 preprocessing for the leftmost pixels of the last row:
// only one new source row exists, so the 5x5 window duplicates row [3] into
// row [4], and the (const) ring buffers are read-only.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
  sq[1] = SquareHi8(s);
  SumHorizontalLo(s, &s3[2], &s5[3]);
  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  LoadAligned16x3U16(sum5, 0, s5);
  // Duplicate the bottom row of the 5-row window.
  s5[4] = s5[3];
  LoadAligned32x3U32(square_sum5, 0, sq5);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
  LoadAligned16x2U16(sum3, 0, s3);
  LoadAligned32x2U32(square_sum3, 0, sq3);
  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
}
1675
// Combined 3x3 + 5x5 preprocessing for the next 16 pixels of the last row:
// row [4] of the 5x5 window duplicates row [3] in both halves, and the ring
// buffers are read-only.
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
    const uint16_t scales[2], const uint16_t* const sum3[4],
    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
    __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
  // First (low) half.
  sq[2] = SquareLo8(s[1]);
  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  LoadAligned16x3U16(sum5, x, s5[0]);
  s5[0][4] = s5[0][3];
  LoadAligned32x3U32(square_sum5, x, sq5);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
  LoadAligned16x2U16(sum3, x, s3[0]);
  LoadAligned32x2U32(square_sum3, x, sq3);
  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);

  // Second (high) half, at offset x + 8 (Msan variants tolerate overread
  // lanes past |sum_width|).
  sq[3] = SquareHi8(s[1]);
  SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
  s5[1][4] = s5[1][3];
  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
  sq5[4][0] = sq5[3][0];
  sq5[4][1] = sq5[3][1];
  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
  CalculateIntermediate(sum, index, ma3, b3 + 1);
}
1709
// Runs the 5x5 preprocessing over a full row pair (|src0|, |src1|) of |width|
// pixels and fills the 565-weighted intermediate rows |ma565| and |b565| used
// by pass 1.  Works 16 pixels per iteration, carrying one 16-pixel lookahead
// in |s|/|sq|/|mas|/|bs| across iterations.
inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
                                    const uint8_t* const src1, const int width,
                                    const uint32_t scale,
                                    uint16_t* const sum5[5],
                                    uint32_t* const square_sum5[5],
                                    const ptrdiff_t sum_width, uint16_t* ma565,
                                    uint32_t* b565) {
  __m128i s[2][2], mas[2], sq[2][4], bs[3];
  // Prime the pipeline with the first 16 pixels of each row; the Msan loads
  // mark lanes past |width| as initialized for the sanitizer.
  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
  sq[0][0] = SquareLo8(s[0][0]);
  sq[1][0] = SquareLo8(s[1][0]);
  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);

  int x = 0;
  do {
    __m128i ma5[3], ma[2], b[4];
    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
                         bs);
    // 565-weight the ma/b intermediates and store 16 output lanes.
    Prepare3_8<0>(mas, ma5);
    ma[0] = Sum565Lo(ma5);
    ma[1] = Sum565Hi(ma5);
    StoreAligned32U16(ma565, ma);
    Sum565W(bs + 0, b + 0);
    Sum565W(bs + 1, b + 2);
    StoreAligned64U32(b565, b);
    // Rotate the lookahead state into position for the next 16 pixels.
    s[0][0] = s[0][1];
    s[1][0] = s[1][1];
    sq[0][1] = sq[0][3];
    sq[1][1] = sq[1][3];
    mas[0] = mas[1];
    bs[0] = bs[2];
    ma565 += 16;
    b565 += 16;
    x += 16;
  } while (x < width);
}
1751
// Runs the 3x3 preprocessing over one source row and fills the pass-2
// intermediate rows.  When |calculate444| is true the 343/444 weighted rows
// (ma343/ma444/b343/b444) are produced; otherwise only the 343 rows are,
// via the plain Sum343 path.  Works 16 pixels per iteration with a 16-pixel
// lookahead carried in |s|/|sq|/|mas|/|bs|.
template <bool calculate444>
LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
    const uint8_t* const src, const int width, const uint32_t scale,
    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
    uint32_t* b444) {
  __m128i s[2], mas[2], sq[4], bs[3];
  // Prime the pipeline with the first 16 pixels.
  s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
  sq[0] = SquareLo8(s[0]);
  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);

  int x = 0;
  do {
    s[1] = LoadUnaligned16Msan(src + x + 16,
                               x + 16 + kOverreadInBytesPass2 - width);
    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
                         bs);
    __m128i ma3[3];
    Prepare3_8<0>(mas, ma3);
    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
      // Produce both the 343 and 444 weighted rows.
      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
      Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
      ma444 += 16;
      b444 += 16;
    } else {
      // Only the 343 weighted rows are needed.
      __m128i ma[2], b[4];
      ma[0] = Sum343Lo(ma3);
      ma[1] = Sum343Hi(ma3);
      StoreAligned32U16(ma343, ma);
      Sum343W(bs + 0, b + 0);
      Sum343W(bs + 1, b + 2);
      StoreAligned64U32(b343, b);
    }
    // Rotate the lookahead state into position for the next 16 pixels.
    s[0] = s[1];
    sq[1] = sq[3];
    mas[0] = mas[1];
    bs[0] = bs[2];
    ma343 += 16;
    b343 += 16;
    x += 16;
  } while (x < width);
}
1794
// Combined preprocessing for a row pair when both passes run: fills the
// pass-2 343/444 intermediate rows for both source rows (ma343[0]/b343[0]
// with plain 343 sums, ma343[1]/ma444/b343[1]/b444 via Store343_444) and the
// pass-1 565 rows (ma565/b565).  Works 16 pixels per iteration with a
// 16-pixel lookahead carried across iterations.
inline void BoxSumFilterPreProcess(
    const uint8_t* const src0, const uint8_t* const src1, const int width,
    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
    uint32_t* b565) {
  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
  // Prime the pipeline with the first 16 pixels of each row.
  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
  sq[0][0] = SquareLo8(s[0][0]);
  sq[1][0] = SquareLo8(s[1][0]);
  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
                        ma3, b3, &ma5[0], &b5[0]);

  int x = 0;
  do {
    __m128i ma[2], b[4], ma3x[3], ma5x[3];
    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
                        sum_width, sq, ma3, b3, ma5, b5);

    // Row 0 of the pair: plain 343 sums.
    Prepare3_8<0>(ma3[0], ma3x);
    ma[0] = Sum343Lo(ma3x);
    ma[1] = Sum343Hi(ma3x);
    StoreAligned32U16(ma343[0] + x, ma);
    Sum343W(b3[0] + 0, b + 0);
    Sum343W(b3[0] + 1, b + 2);
    StoreAligned64U32(b343[0] + x, b);
    // Pass-1 565 sums for the b values (ma handled below, after b is reused).
    Sum565W(b5 + 0, b + 0);
    Sum565W(b5 + 1, b + 2);
    StoreAligned64U32(b565, b);
    // Row 1 of the pair: 343 and 444 rows.
    Prepare3_8<0>(ma3[1], ma3x);
    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
    Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
    // Pass-1 565 sums for ma.
    Prepare3_8<0>(ma5, ma5x);
    ma[0] = Sum565Lo(ma5x);
    ma[1] = Sum565Hi(ma5x);
    StoreAligned32U16(ma565, ma);
    // Rotate the lookahead state into position for the next 16 pixels.
    s[0][0] = s[0][1];
    s[1][0] = s[1][1];
    sq[0][1] = sq[0][3];
    sq[1][1] = sq[1][3];
    ma3[0][0] = ma3[0][1];
    ma3[1][0] = ma3[1][1];
    ma5[0] = ma5[1];
    b3[0][0] = b3[0][2];
    b3[1][0] = b3[1][2];
    b5[0] = b5[2];
    ma565 += 16;
    b565 += 16;
    x += 16;
  } while (x < width);
}
1852
1853 template <int shift>
FilterOutput(const __m128i ma_x_src,const __m128i b)1854 inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
1855 // ma: 255 * 32 = 8160 (13 bits)
1856 // b: 65088 * 32 = 2082816 (21 bits)
1857 // v: b - ma * 255 (22 bits)
1858 const __m128i v = _mm_sub_epi32(b, ma_x_src);
1859 // kSgrProjSgrBits = 8
1860 // kSgrProjRestoreBits = 4
1861 // shift = 4 or 5
1862 // v >> 8 or 9 (13 bits)
1863 return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
1864 }
1865
1866 template <int shift>
CalculateFilteredOutput(const __m128i src,const __m128i ma,const __m128i b[2])1867 inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
1868 const __m128i b[2]) {
1869 const __m128i ma_x_src_lo = VmullLo16(ma, src);
1870 const __m128i ma_x_src_hi = VmullHi16(ma, src);
1871 const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
1872 const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
1873 return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
1874 }
1875
CalculateFilteredOutputPass1(const __m128i src,const __m128i ma[2],const __m128i b[2][2])1876 inline __m128i CalculateFilteredOutputPass1(const __m128i src,
1877 const __m128i ma[2],
1878 const __m128i b[2][2]) {
1879 const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
1880 __m128i b_sum[2];
1881 b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
1882 b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
1883 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1884 }
1885
CalculateFilteredOutputPass2(const __m128i src,const __m128i ma[3],const __m128i b[3][2])1886 inline __m128i CalculateFilteredOutputPass2(const __m128i src,
1887 const __m128i ma[3],
1888 const __m128i b[3][2]) {
1889 const __m128i ma_sum = Sum3_16(ma);
1890 __m128i b_sum[2];
1891 Sum3_32(b, b_sum);
1892 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1893 }
1894
SelfGuidedFinal(const __m128i src,const __m128i v[2])1895 inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
1896 const __m128i v_lo =
1897 VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1898 const __m128i v_hi =
1899 VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1900 const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
1901 return _mm_add_epi16(src, vv);
1902 }
1903
SelfGuidedDoubleMultiplier(const __m128i src,const __m128i filter[2],const int w0,const int w2)1904 inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
1905 const __m128i filter[2], const int w0,
1906 const int w2) {
1907 __m128i v[2];
1908 const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
1909 const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
1910 const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
1911 v[0] = _mm_madd_epi16(w0_w2, f_lo);
1912 v[1] = _mm_madd_epi16(w0_w2, f_hi);
1913 return SelfGuidedFinal(src, v);
1914 }
1915
SelfGuidedSingleMultiplier(const __m128i src,const __m128i filter,const int w0)1916 inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
1917 const __m128i filter, const int w0) {
1918 // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
1919 __m128i v[2];
1920 v[0] = VmullNLo8(filter, w0);
1921 v[1] = VmullNHi8(filter, w0);
1922 return SelfGuidedFinal(src, v);
1923 }
1924
// Pass-1 (5x5) box filtering for a row pair: preprocesses rows |src0|/|src1|,
// combines the new 565 intermediates (stored to ma565[1]/b565[1]) with the
// previous row's (read from ma565[0]/b565[0]), applies the self-guided
// weight |w0|, and writes two filtered output rows to |dst| and
// |dst| + stride.  Works 16 pixels per iteration (two 8-pixel halves), with
// a 16-pixel lookahead carried in |s|/|sq|/|mas|/|bs|.
LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
    const uint8_t* const src, const uint8_t* const src0,
    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
    uint32_t* const b565[2], uint8_t* const dst) {
  __m128i s[2][2], mas[2], sq[2][4], bs[3];
  // Prime the pipeline with the first 16 pixels of each row.
  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
  sq[0][0] = SquareLo8(s[0][0]);
  sq[1][0] = SquareLo8(s[1][0]);
  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);

  int x = 0;
  do {
    __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
                         bs);
    // Low 8 pixels: compute and store this row pair's 565 intermediates,
    // then filter both rows (the first row uses this + the previous row's
    // intermediates; the second row reuses this row's with shift 4).
    Prepare3_8<0>(mas, ma5);
    ma[1] = Sum565Lo(ma5);
    StoreAligned16(ma565[1] + x, ma[1]);
    Sum565W(bs, b[1]);
    StoreAligned32U32(b565[1] + x, b[1]);
    sr[0] = LoadAligned16(src + x);
    sr[1] = LoadAligned16(src + stride + x);
    const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
    const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565[0] + x);
    LoadAligned32U32(b565[0] + x, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);

    // High 8 pixels: same sequence at x + 8.
    ma[1] = Sum565Hi(ma5);
    StoreAligned16(ma565[1] + x + 8, ma[1]);
    Sum565W(bs + 1, b[1]);
    StoreAligned32U32(b565[1] + x + 8, b[1]);
    const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
    const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565[0] + x + 8);
    LoadAligned32U32(b565[0] + x + 8, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
    const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
    const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
    // Rotate the lookahead state into position for the next 16 pixels.
    s[0][0] = s[0][1];
    s[1][0] = s[1][1];
    sq[0][1] = sq[0][3];
    sq[1][1] = sq[1][3];
    mas[0] = mas[1];
    bs[0] = bs[2];
    x += 16;
  } while (x < width);
}
1986
// Pass-1 (5x5) box filtering for the last row: like BoxFilterPass1() but
// with a single source row, using the LastRow preprocessing variants (which
// duplicate the missing bottom row) and producing one output row.
inline void BoxFilterPass1LastRow(
    const uint8_t* const src, const uint8_t* const src0, const int width,
    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
    uint32_t* b565, uint8_t* const dst) {
  __m128i s[2], mas[2], sq[4], bs[3];
  // Prime the pipeline with the first 16 pixels.
  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  sq[0] = SquareLo8(s[0]);
  BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
                                &bs[0]);

  int x = 0;
  do {
    __m128i ma[2], ma5[3], b[2][2];
    s[1] = LoadUnaligned16Msan(src0 + x + 16,
                               x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
                                sq, mas, bs);
    // Low 8 pixels: combine this row's 565 intermediates with the previous
    // row's (loaded from ma565/b565) and filter.
    Prepare3_8<0>(mas, ma5);
    ma[1] = Sum565Lo(ma5);
    Sum565W(bs, b[1]);
    ma[0] = LoadAligned16(ma565);
    LoadAligned32U32(b565, b[0]);
    const __m128i sr = LoadAligned16(src + x);
    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
    __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);

    // High 8 pixels: same sequence at offset 8.
    ma[1] = Sum565Hi(ma5);
    Sum565W(bs + 1, b[1]);
    ma[0] = LoadAligned16(ma565 + 8);
    LoadAligned32U32(b565 + 8, b[0]);
    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
    // Rotate the lookahead state into position for the next 16 pixels.
    s[0] = s[1];
    sq[1] = sq[3];
    mas[0] = mas[1];
    bs[0] = bs[2];
    ma565 += 16;
    b565 += 16;
    x += 16;
  } while (x < width);
}
2032
// Self-guided filter, pass 2 (3x3 box), for one output row. |src0| feeds the
// running 3x3 box sums while |src| provides the pixels being filtered. The
// ma343/b343 (and ma444/b444) pointer arrays hold the 343/444-weighted window
// rows: index 0 is consumed for this row's output, while index 2
// (respectively 1) is written for use by future rows via Store343_444*.
// Processes 16 pixels per loop iteration.
LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
    const uint8_t* const src, const uint8_t* const src0, const int width,
    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
    uint32_t* const b444[2], uint8_t* const dst) {
  __m128i s[2], mas[2], sq[4], bs[3];
  // May read up to kOverreadInBytesPass2 bytes past |width| (buffers padded).
  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
  sq[0] = SquareLo8(s[0]);
  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);

  int x = 0;
  do {
    s[1] = LoadUnaligned16Msan(src0 + x + 16,
                               x + 16 + kOverreadInBytesPass2 - width);
    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
                         bs);
    __m128i ma[3], b[3][2], ma3[3];
    Prepare3_8<0>(mas, ma3);
    // Low 8 pixels: store this row's 343/444 sums and receive the newest
    // window row in ma[2]/b[2].
    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
                   b444[1]);
    const __m128i sr = LoadAligned16(src + x);
    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma343[0] + x);
    ma[1] = LoadAligned16(ma444[0] + x);
    LoadAligned32U32(b343[0] + x, b[0]);
    LoadAligned32U32(b444[0] + x, b[1]);
    const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);

    // High 8 pixels, mirroring the low-half sequence at offset +8.
    Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
                   b343[2], b444[1]);
    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma343[0] + x + 8);
    ma[1] = LoadAligned16(ma444[0] + x + 8);
    LoadAligned32U32(b343[0] + x + 8, b[0]);
    LoadAligned32U32(b444[0] + x + 8, b[1]);
    const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
    // Slide the 16-pixel window.
    s[0] = s[1];
    sq[1] = sq[3];
    mas[0] = mas[1];
    bs[0] = bs[2];
    x += 16;
  } while (x < width);
}
2080
// Combined self-guided filter applying both pass 1 (5x5 box) and pass 2
// (3x3 box) to two rows at a time. |src0|/|src1| feed the running box sums;
// |src| and |src + stride| are the rows being filtered. Per-pass outputs are
// blended with weights |w0| (pass 1) and |w2| (pass 2) by
// SelfGuidedDoubleMultiplier.
//
// Index convention inside the loop:
//   ma[0]/b[0]      pass-1 (565-weighted) terms, shared by both rows;
//   ma[1]/b[1]      pass-2 (343/444) terms for the first row;
//   ma[2]/b[2]      pass-2 terms for the second row;
//   p[row][pass]    the per-pass filtered results.
// Processes 16 pixels per iteration, split into low/high 8-pixel halves.
LIBGAV1_ALWAYS_INLINE void BoxFilter(
    const uint8_t* const src, const uint8_t* const src0,
    const uint8_t* const src1, const ptrdiff_t stride, const int width,
    const uint16_t scales[2], const int16_t w0, const int16_t w2,
    uint16_t* const sum3[4], uint16_t* const sum5[5],
    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
    const ptrdiff_t sum_width, uint16_t* const ma343[4],
    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
  // Loads may overread past |width| (Msan-annotated, buffers padded).
  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
  sq[0][0] = SquareLo8(s[0][0]);
  sq[1][0] = SquareLo8(s[1][0]);
  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
                        ma3, b3, &ma5[0], &b5[0]);

  int x = 0;
  do {
    __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
                                  x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
                        sum_width, sq, ma3, b3, ma5, b5);
    Prepare3_8<0>(ma3[0], ma3x[0]);
    Prepare3_8<0>(ma3[1], ma3x[1]);
    Prepare3_8<0>(ma5, ma5x);
    // Low 8 pixels. Store pass-2 343/444 sums for both new rows and fetch the
    // newest window rows.
    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
                   ma343[2], ma444[1], b343[2], b444[1]);
    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
                   b343[3], b444[2]);
    // Pass-1 565 sums for the second row of the pair are also saved for reuse
    // by the next call (ma565[1]/b565[1]).
    ma[0][1] = Sum565Lo(ma5x);
    StoreAligned16(ma565[1] + x, ma[0][1]);
    Sum565W(b5, b[0][1]);
    StoreAligned32U32(b565[1] + x, b[0][1]);
    const __m128i sr0 = LoadAligned16(src + x);
    const __m128i sr1 = LoadAligned16(src + stride + x);
    const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
    const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
    ma[0][0] = LoadAligned16(ma565[0] + x);
    LoadAligned32U32(b565[0] + x, b[0][0]);
    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
    ma[1][0] = LoadAligned16(ma343[0] + x);
    ma[1][1] = LoadAligned16(ma444[0] + x);
    LoadAligned32U32(b343[0] + x, b[1][0]);
    LoadAligned32U32(b444[0] + x, b[1][1]);
    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
    const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
    ma[2][0] = LoadAligned16(ma343[1] + x);
    LoadAligned32U32(b343[1] + x, b[2][0]);
    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
    const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);

    // High 8 pixels: same sequence at offset +8.
    Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
    Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
                   ma444[2], b343[3], b444[2]);
    ma[0][1] = Sum565Hi(ma5x);
    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
    Sum565W(b5 + 1, b[0][1]);
    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
    const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
    const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
    // Slide the window for both rows and both passes.
    s[0][0] = s[0][1];
    s[1][0] = s[1][1];
    sq[0][1] = sq[0][3];
    sq[1][1] = sq[1][3];
    ma3[0][0] = ma3[0][1];
    ma3[1][0] = ma3[1][1];
    ma5[0] = ma5[1];
    b3[0][0] = b3[0][2];
    b3[1][0] = b3[1][2];
    b5[0] = b5[2];
    x += 16;
  } while (x < width);
}
2176
// Combined self-guided filter (both passes) for the final row of a
// restoration unit. Like BoxFilterPass1LastRow, this consumes the
// ma565/b565, ma343/b343 and ma444/b444 intermediates produced for the
// preceding rows together with sums computed from the single remaining
// source row |src0|, and writes one blended output row. Processes 16 pixels
// per loop iteration.
inline void BoxFilterLastRow(
    const uint8_t* const src, const uint8_t* const src0, const int width,
    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
    uint8_t* const dst) {
  __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
  // May overread past |width| (Msan-annotated, buffers padded).
  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  sq[0] = SquareLo8(s[0]);
  BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
                               square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
                               &b5[0]);

  int x = 0;
  do {
    __m128i ma3x[3], ma5x[3], p[2];
    s[1] = LoadUnaligned16Msan(src0 + x + 16,
                               x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
    Prepare3_8<0>(ma3, ma3x);
    Prepare3_8<0>(ma5, ma5x);
    // Low 8 pixels. ma/b are reused: pass 1 consumes slots [0] (stored) and
    // [1] (current); pass 2 then reloads [0]/[1] and uses [2] (current).
    ma[1] = Sum565Lo(ma5x);
    Sum565W(b5, b[1]);
    ma[2] = Sum343Lo(ma3x);
    Sum343W(b3, b[2]);
    const __m128i sr = LoadAligned16(src + x);
    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565 + x);
    LoadAligned32U32(b565 + x, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
    ma[0] = LoadAligned16(ma343 + x);
    ma[1] = LoadAligned16(ma444 + x);
    LoadAligned32U32(b343 + x, b[0]);
    LoadAligned32U32(b444 + x, b[1]);
    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);

    // High 8 pixels: same sequence at offset +8.
    ma[1] = Sum565Hi(ma5x);
    Sum565W(b5 + 1, b[1]);
    ma[2] = Sum343Hi(ma3x);
    Sum343W(b3 + 1, b[2]);
    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565 + x + 8);
    LoadAligned32U32(b565 + x + 8, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
    ma[0] = LoadAligned16(ma343 + x + 8);
    ma[1] = LoadAligned16(ma444 + x + 8);
    LoadAligned32U32(b343 + x + 8, b[0]);
    LoadAligned32U32(b444 + x + 8, b[1]);
    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
    // Slide the window.
    s[0] = s[1];
    sq[1] = sq[3];
    ma3[0] = ma3[1];
    ma5[0] = ma5[1];
    b3[0] = b3[2];
    b5[0] = b5[2];
    x += 16;
  } while (x < width);
}
2241
// Drives the combined (pass 1 + pass 2) self-guided restoration over a whole
// unit. Sets up ring buffers of row pointers into |sgr_buffer| for the box
// sums (sum3/sum5 and their squares) and for the per-row filter
// intermediates (ma343/ma444/ma565, b343/b444/b565), then filters two rows
// per iteration with BoxFilter, rotating pointers between iterations instead
// of copying row data. Border rows substitute for missing image rows at the
// top and bottom of the unit.
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
    const RestorationUnitInfo& restoration_info, const uint8_t* src,
    const ptrdiff_t stride, const uint8_t* const top_border,
    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    SgrBuffer* const sgr_buffer, uint8_t* dst) {
  // Strides are rounded up so every 16-pixel SIMD store stays in bounds.
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  // The three projection weights sum to (1 << kSgrProjPrecisionBits).
  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
  // Carve consecutive rows out of the shared scratch buffer.
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 3; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  b444[0] = sgr_buffer->b444;
  for (int i = 1; i <= 2; ++i) {
    ma444[i] = ma444[i - 1] + temp_stride;
    b444[i] = b444[i - 1] + temp_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scales[0] != 0);
  assert(scales[1] != 0);
  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
         sum5[1], square_sum5[1]);
  // Duplicate the top border row in the 5-row window.
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  // For a single-row unit the second preprocess row comes from the bottom
  // border.
  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
                         b343, b444[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

  // Main loop: two rows per iteration; pointers rotate, data stays put.
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate4PointersBy2<uint16_t>(sum3);
    Circulate4PointersBy2<uint32_t>(square_sum3);
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
              ma343, ma444, ma565, b343, b444, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    Circulate4PointersBy2<uint16_t>(ma343);
    Circulate4PointersBy2<uint32_t>(b343);
    std::swap(ma444[0], ma444[2]);
    std::swap(b444[0], b444[2]);
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  // Final row pair, pulling rows from the bottom border as needed. This is
  // skipped only when height == 1.
  Circulate4PointersBy2<uint16_t>(sum3);
  Circulate4PointersBy2<uint32_t>(square_sum3);
  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint8_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
              b444, b565, dst);
  }
  // Odd height leaves one last unpaired row.
  if ((height & 1) != 0) {
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      Circulate4PointersBy2<uint16_t>(sum3);
      Circulate4PointersBy2<uint32_t>(square_sum3);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
      Circulate4PointersBy2<uint16_t>(ma343);
      Circulate4PointersBy2<uint32_t>(b343);
      std::swap(ma444[0], ma444[2]);
      std::swap(b444[0], b444[2]);
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
    }
    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
                     b444[0], b565[0], dst);
  }
}
2353
// Drives pass-1-only (5x5 box) self-guided restoration over a whole unit.
// Same ring-buffer scheme as BoxFilterProcess, but only the 5-row sums and
// the ma565/b565 intermediates are maintained. Two rows are filtered per
// iteration of the main loop; border rows substitute for missing image rows.
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  // Strides rounded up so 16-pixel SIMD stores stay in bounds.
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  uint16_t *sum5[5], *ma565[2];
  uint32_t *square_sum5[5], *b565[2];
  // Carve consecutive rows out of the shared scratch buffer.
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scale != 0);
  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum5[1], square_sum5[1]);
  // Duplicate the top border row in the 5-row window.
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  // Single-row units take their second preprocess row from the bottom border.
  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
                          ma565[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

  // Main loop: two rows per iteration; pointers rotate, data stays put.
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  // Final row pair, pulling rows from the bottom border as needed. This is
  // skipped only when height == 1.
  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint8_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
                   sum_width, scale, w0, ma565, b565, dst);
  }
  // Odd height leaves one last unpaired row.
  if ((height & 1) != 0) {
    // +3 undoes the left shift the caller applied for the sum inputs; the
    // filtered-pixel pointer is not shifted (matches the BoxFilterPass1
    // calls above, which pass src + 3).
    src += 3;
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
    }
    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
                          b565[0], dst);
  }
}
2431
// Drives pass-2-only (3x3 box) self-guided restoration over a whole unit.
// Maintains 3-row sum windows and the ma343/ma444 (and b343/b444)
// intermediates, filtering one row per iteration. The last iterations of the
// final do/while consume bottom border rows in place of image rows.
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  // Pass-2-only parameter sets have a zero pass-1 multiplier.
  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
  // Strides rounded up so 16-pixel SIMD stores stay in bounds.
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  // The two active weights sum to (1 << kSgrProjPrecisionBits).
  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
  uint16_t *sum3[3], *ma343[3], *ma444[2];
  uint32_t *square_sum3[3], *b343[3], *b444[2];
  // Carve consecutive rows out of the shared scratch buffer.
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 2; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  ma444[1] = ma444[0] + temp_stride;
  b444[0] = sgr_buffer->b444;
  b444[1] = b444[0] + temp_stride;
  assert(scale != 0);
  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum3[0], square_sum3[0]);
  // Prime the window: first row has no ma444/b444 output yet.
  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
                                 sum_width, ma343[0], nullptr, b343[0],
                                 nullptr);
  Circulate3PointersBy1<uint16_t>(sum3);
  Circulate3PointersBy1<uint32_t>(square_sum3);
  const uint8_t* s;
  if (height > 1) {
    s = src + stride;
  } else {
    // Single-row unit: the second preprocess row comes from the bottom
    // border, which is then advanced for the output loop below.
    s = bottom_border;
    bottom_border += bottom_border_stride;
  }
  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
                                ma343[1], ma444[0], b343[1], b444[0]);

  // Main loop: one row per iteration; pointers rotate, data stays put.
  for (int y = height - 2; y > 0; --y) {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  }

  // Remaining one or two rows use bottom border rows as sum input. The +2
  // undoes the left shift the caller applied for the sum inputs.
  int y = std::min(height, 2);
  src += 2;
  do {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    bottom_border += bottom_border_stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  } while (--y != 0);
}
2511
2512 // If |width| is non-multiple of 16, up to 15 more pixels are written to |dest|
2513 // in the end of each row. It is safe to overwrite the output as it will not be
2514 // part of the visible frame.
SelfGuidedFilter_SSE4_1(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)2515 void SelfGuidedFilter_SSE4_1(
2516 const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
2517 const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
2518 const void* LIBGAV1_RESTRICT const top_border,
2519 const ptrdiff_t top_border_stride,
2520 const void* LIBGAV1_RESTRICT const bottom_border,
2521 const ptrdiff_t bottom_border_stride, const int width, const int height,
2522 RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
2523 void* LIBGAV1_RESTRICT const dest) {
2524 const int index = restoration_info.sgr_proj_info.index;
2525 const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
2526 const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
2527 const auto* const src = static_cast<const uint8_t*>(source);
2528 const auto* top = static_cast<const uint8_t*>(top_border);
2529 const auto* bottom = static_cast<const uint8_t*>(bottom_border);
2530 auto* const dst = static_cast<uint8_t*>(dest);
2531 SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
2532 if (radius_pass_1 == 0) {
2533 // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
2534 // following assertion.
2535 assert(radius_pass_0 != 0);
2536 BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
2537 top_border_stride, bottom - 3, bottom_border_stride,
2538 width, height, sgr_buffer, dst);
2539 } else if (radius_pass_0 == 0) {
2540 BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
2541 top_border_stride, bottom - 2, bottom_border_stride,
2542 width, height, sgr_buffer, dst);
2543 } else {
2544 BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
2545 top_border_stride, bottom - 3, bottom_border_stride, width,
2546 height, sgr_buffer, dst);
2547 }
2548 }
2549
// Registers the SSE4.1 loop restoration implementations in the 8bpp dispatch
// table. Each entry is installed only when the corresponding
// DSP_ENABLED_8BPP_SSE4_1 macro enables it; otherwise the symbol is
// referenced via static_cast<void> to silence unused-function warnings.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  static_cast<void>(dsp);  // Unused if both #if blocks below are disabled.
#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#else
  static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#else
  static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}
2565
2566 } // namespace
2567 } // namespace low_bitdepth
2568
// Public entry point: installs the SSE4.1 loop restoration functions
// (8bpp only; this file has no high-bitdepth implementations).
void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
2570
2571 } // namespace dsp
2572 } // namespace libgav1
2573
2574 #else // !LIBGAV1_TARGETING_SSE4_1
2575 namespace libgav1 {
2576 namespace dsp {
2577
// SSE4.1 targeting is disabled at build time; nothing to register.
void LoopRestorationInit_SSE4_1() {}
2579
2580 } // namespace dsp
2581 } // namespace libgav1
2582 #endif // LIBGAV1_TARGETING_SSE4_1
2583