// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/loop_restoration.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

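// Shifts the two 16-bit horizontal filter sums down with rounding, adds back
// the contribution that was removed by offsetting the center tap by 128 (see
// |s_3x128| in the kernels below), and clamps the result to the 13-bit
// intermediate range expected by the vertical pass.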
inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
                                 int16_t* const wiener_buffer) {
  constexpr int offset =
      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
  constexpr int limit =
      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
  const __m128i offsets = _mm_set1_epi16(-offset);
  const __m128i limits = _mm_set1_epi16(limit - offset);
  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
  const __m128i sum = _mm_add_epi16(s[0], s[1]);
  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
  // Add back scaled down offset correction.
  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
  const __m128i d1 = _mm_min_epi16(d0, limits);
  StoreAligned16(wiener_buffer, d1);
}

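// In the kernels below |s_3x128| reconstructs (center_pixel * 128) >>
// kInterRoundBitsHorizontal: the center tap is stored as (coefficient - 128)
// in |filter|, so the missing 128 * pixel term is extracted from the packed
// source bytes and re-added after rounding in WienerHorizontalClip().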
inline void WienerHorizontalTap7Kernel(const __m128i s[4],
                                       const __m128i filter[4],
                                       int16_t* const wiener_buffer) {
  __m128i madds[4];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
  madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
  madds[0] = _mm_add_epi16(madds[0], madds[2]);
  madds[1] = _mm_add_epi16(madds[1], madds[3]);
  const __m128i s_3x128 =
      _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

inline void WienerHorizontalTap5Kernel(const __m128i s[5],
                                       const __m128i filter[3],
                                       int16_t* const wiener_buffer) {
  __m128i madds[3];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
  madds[0] = _mm_add_epi16(madds[0], madds[2]);
  const __m128i s_3x128 =
      _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

inline void WienerHorizontalTap3Kernel(const __m128i s[2],
                                       const __m128i filter[2],
                                       int16_t* const wiener_buffer) {
  __m128i madds[2];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  const __m128i s_3x128 =
      _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

// loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
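// |filter| packs the coefficients into (low byte, high byte) pairs so that
// _mm_maddubs_epi16() multiplies two neighboring unsigned source bytes at
// once. With c0..c3 being filter_horizontal[0..3]: filter[0] = (c0, c1),
// filter[1] = (c2, c3 - 128), filter[2] = (c2, c1) and filter[3] = (c0, 1),
// where the trailing 1 multiplies the rounding byte interleaved with the
// last tap.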
inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient0,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[4];
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[7], ss[4];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      s[5] = LoadUnaligned16(src + x + 5);
      s[6] = LoadUnaligned16(src + x + 6);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
      ss[3] = _mm_unpacklo_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
      ss[3] = _mm_unpackhi_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient1,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[3];
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[5], ss[3];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient2,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[3], ss[2];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 int16_t** const wiener_buffer) {
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      const __m128i s = LoadUnaligned16(src + x);
      const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
      const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
      const __m128i d0 = _mm_slli_epi16(s0, 4);
      const __m128i d1 = _mm_slli_epi16(s1, 4);
      StoreAligned16(*wiener_buffer + x + 0, d0);
      StoreAligned16(*wiener_buffer + x + 8, d1);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

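// Vertical pass helpers. Rows are interleaved in pairs so that
// _mm_madd_epi16() applies two filter taps per multiply; results are shifted
// down by kInterRoundBitsVertical. Note the 5-tap path folds the rounding
// constant into the (center, round) interleave in WienerVerticalFilter5()
// instead of adding it in WienerVertical5().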
inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i sum0 = _mm_add_epi32(round, madd0);
  const __m128i sum1 = _mm_add_epi32(sum0, madd1);
  return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
}

inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i sum = _mm_add_epi32(madd0, madd1);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
  const __m128i madd = _mm_madd_epi16(a, filter);
  const __m128i sum = _mm_add_epi32(round, madd);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVerticalFilter7(const __m128i a[7],
                                     const __m128i filter[2]) {
  __m128i b[2];
  const __m128i a06 = _mm_add_epi16(a[0], a[6]);
  const __m128i a15 = _mm_add_epi16(a[1], a[5]);
  const __m128i a24 = _mm_add_epi16(a[2], a[4]);
  b[0] = _mm_unpacklo_epi16(a06, a15);
  b[1] = _mm_unpacklo_epi16(a24, a[3]);
  const __m128i sum0 = WienerVertical7(b, filter);
  b[0] = _mm_unpackhi_epi16(a06, a15);
  b[1] = _mm_unpackhi_epi16(a24, a[3]);
  const __m128i sum1 = WienerVertical7(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalFilter5(const __m128i a[5],
                                     const __m128i filter[2]) {
  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
  __m128i b[2];
  const __m128i a04 = _mm_add_epi16(a[0], a[4]);
  const __m128i a13 = _mm_add_epi16(a[1], a[3]);
  b[0] = _mm_unpacklo_epi16(a04, a13);
  b[1] = _mm_unpacklo_epi16(a[2], round);
  const __m128i sum0 = WienerVertical5(b, filter);
  b[0] = _mm_unpackhi_epi16(a04, a13);
  b[1] = _mm_unpackhi_epi16(a[2], round);
  const __m128i sum1 = WienerVertical5(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
  __m128i b;
  const __m128i a02 = _mm_add_epi16(a[0], a[2]);
  b = _mm_unpacklo_epi16(a02, a[1]);
  const __m128i sum0 = WienerVertical3(b, filter);
  b = _mm_unpackhi_epi16(a02, a[1]);
  const __m128i sum1 = WienerVertical3(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[7]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
  return WienerVerticalFilter7(a, filter);
}

inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[5]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  return WienerVerticalFilter5(a, filter);
}

inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter, __m128i a[3]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  return WienerVerticalFilter3(a, filter);
}

inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[8];
  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
  a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
  d[1] = WienerVerticalFilter7(a + 1, filter);
}

inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[6];
  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  d[1] = WienerVerticalFilter5(a + 1, filter);
}

inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter, __m128i d[2]) {
  __m128i a[4];
  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  d[1] = WienerVerticalFilter3(a + 1, filter);
}

inline void WienerVerticalTap7(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[4], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = LoadLo8(coefficients);
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi32(c, 0x0);
  filter[1] = _mm_shuffle_epi32(c, 0x55);
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[7];
      const __m128i d0 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

inline void WienerVerticalTap5(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[3], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = Load4(coefficients);
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi32(c, 0);
  filter[1] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[5];
      const __m128i d0 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

inline void WienerVerticalTap3(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[2], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i filter =
      _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[3];
      const __m128i d0 =
          WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

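// The 1-tap vertical path only needs to undo the 4-bit headroom added by
// WienerHorizontalTap1(): add the rounding constant 8, shift right by 4 and
// pack to 8 bits.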
inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
                                     uint8_t* const dst) {
  const __m128i a0 = LoadAligned16(wiener_buffer + 0);
  const __m128i a1 = LoadAligned16(wiener_buffer + 8);
  const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
  const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
  const __m128i c0 = _mm_srai_epi16(b0, 4);
  const __m128i c1 = _mm_srai_epi16(b1, 4);
  const __m128i d = _mm_packus_epi16(c0, c1);
  StoreAligned16(dst, d);
}

inline void WienerVerticalTap1(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               uint8_t* dst, const ptrdiff_t dst_stride) {
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      x += 16;
    } while (x < width);
  }
}

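// Top-level Wiener filter: the horizontal pass filters |top_border|, |source|
// and |bottom_border| into |wiener_buffer| (picking the 7-, 5-, 3- or 1-tap
// kernel based on the number of leading zero coefficients), then the vertical
// pass filters |wiener_buffer| into |dest| with the matching tap count.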
void WienerFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int16_t* const number_leading_zero_coefficients =
      restoration_info.wiener_info.number_leading_zero_coefficients;
  const int number_rows_to_skip = std::max(
      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
      1);
  const ptrdiff_t wiener_stride = Align(width, 16);
  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
  // The values are saturated to 13 bits before storing.
  int16_t* wiener_buffer_horizontal =
      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;

  // horizontal filtering.
  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
  const int height_horizontal =
      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
  const int height_extra = (height_horizontal - height) >> 1;
  assert(height_extra <= 2);
  const auto* const src = static_cast<const uint8_t*>(source);
  const auto* const top = static_cast<const uint8_t*>(top_border);
  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
  const int16_t* const filter_horizontal =
      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
  const __m128i c = LoadLo8(filter_horizontal);
  // In order to keep the horizontal pass intermediate values within 16 bits we
  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
  const __m128i coefficients_horizontal =
      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[0],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[1],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
    // The maximum over-reads happen here.
    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[2],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
                         top_border_stride, wiener_stride, height_extra,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(src, stride, wiener_stride, height,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
                         height_extra, &wiener_buffer_horizontal);
  }

  // vertical filtering.
  // Over-writes up to 15 values.
  const int16_t* const filter_vertical =
      restoration_info.wiener_info.filter[WienerInfo::kVertical];
  auto* dst = static_cast<uint8_t*>(dest);
  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
    // Because the top row of |source| is a duplicate of the second row, and the
    // bottom row of |source| is a duplicate of its above row, we can duplicate
    // the top and bottom row of |wiener_buffer| accordingly.
    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
           sizeof(*wiener_buffer_horizontal) * wiener_stride);
    memcpy(restoration_buffer->wiener_buffer,
           restoration_buffer->wiener_buffer + wiener_stride,
           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
                       filter_vertical, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
                       height, filter_vertical + 1, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
                       wiener_stride, height, filter_vertical + 2, dst, stride);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
                       wiener_stride, height, dst, stride);
  }
}

//------------------------------------------------------------------------------
// SGR

// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
// Pass 1 and 2 for Pass 2.
constexpr int kOverreadInBytesPass1 = 10;
constexpr int kOverreadInBytesPass2 = 12;

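// The *Msan load variants take |border|, the number of valid values in the
// row, and pass the resulting over-read byte count to LoadAligned16Msan() so
// that MemorySanitizer does not flag the intentional reads past the end of
// each row.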
inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
}

inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
  dst[2] = LoadAligned16(src[2] + x);
}

inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
  dst[0] = LoadAligned16(src + 0);
  dst[1] = LoadAligned16(src + 4);
}

inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
                                 const ptrdiff_t border, __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
}

inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
}

inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
}

inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
  LoadAligned32U32(src[2] + x, dst[2]);
}

inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
  LoadAligned32U32Msan(src[2], x, border, dst[2]);
}

inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 8, src[1]);
}

inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 4, src[1]);
}

inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
  StoreAligned32U32(dst + 0, src + 0);
  StoreAligned32U32(dst + 8, src + 2);
}

// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
// functions. Some compilers may generate super inefficient code and the whole
// decoder could be 15% slower.

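// The following helpers mirror the NEON intrinsics they are named after:
// Vaddl* add two vectors with widening, Vaddw* add a widened second operand,
// Vmull* multiply with widening, and Vrshr* shift right with rounding.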
inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(s0, s1);
}

inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(s0, s1);
}

inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, s1);
}

inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, s1);
}

inline __m128i VmullNLo8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullNHi8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VrshrS32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srai_epi32(sum, src1);
}

inline __m128i VrshrU32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srli_epi32(sum, src1);
}

inline __m128i SquareLo8(const __m128i src) {
  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(s, s);
}

inline __m128i SquareHi8(const __m128i src) {
  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(s, s);
}

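// Prepare3*/Prepare5* build the shifted copies of the source needed to form
// 3- and 5-wide sliding-window sums: the Lo8 variants shift within a single
// register (only the low half of the result is meaningful), while the
// two-register variants use _mm_alignr_epi8() to span a pair of registers.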
inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
}

template <int offset>
inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
}

inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
  dst[0] = src[0];
  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
}

inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
  dst[3] = _mm_srli_si128(src, 3);
  dst[4] = _mm_srli_si128(src, 4);
}

template <int offset>
inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
}

inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
  Prepare3_16(src, dst);
  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
}

inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi16(src0, src1);
  return _mm_add_epi16(sum, src2);
}

inline __m128i Sum3_16(const __m128i src[3]) {
  return Sum3_16(src[0], src[1], src[2]);
}

inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi32(src0, src1);
  return _mm_add_epi32(sum, src2);
}

inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
}

inline __m128i Sum3WLo16(const __m128i src[3]) {
  const __m128i sum = VaddlLo8(src[0], src[1]);
  return VaddwLo8(sum, src[2]);
}

inline __m128i Sum3WHi16(const __m128i src[3]) {
  const __m128i sum = VaddlHi8(src[0], src[1]);
  return VaddwHi8(sum, src[2]);
}

inline __m128i Sum3WLo32(const __m128i src[3]) {
  const __m128i sum = VaddlLo16(src[0], src[1]);
  return VaddwLo16(sum, src[2]);
}

inline __m128i Sum3WHi32(const __m128i src[3]) {
  const __m128i sum = VaddlHi16(src[0], src[1]);
  return VaddwHi16(sum, src[2]);
}

inline __m128i Sum5_16(const __m128i src[5]) {
  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return _mm_add_epi16(sum, src[4]);
}

inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
                       const __m128i* const src2, const __m128i* const src3,
                       const __m128i* const src4) {
  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
  const __m128i sum = _mm_add_epi32(sum01, sum23);
  return _mm_add_epi32(sum, *src4);
}

inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
}

inline __m128i Sum5WLo16(const __m128i src[5]) {
  const __m128i sum01 = VaddlLo8(src[0], src[1]);
  const __m128i sum23 = VaddlLo8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwLo8(sum, src[4]);
}

inline __m128i Sum5WHi16(const __m128i src[5]) {
  const __m128i sum01 = VaddlHi8(src[0], src[1]);
  const __m128i sum23 = VaddlHi8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwHi8(sum, src[4]);
}

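// Sum3Horizontal/Sum5Horizontal sum 3 or 5 adjacent 8-bit pixels into 16-bit
// lanes; the W variants do the same for 16-bit (squared) values, producing
// 32-bit lanes.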
inline __m128i Sum3Horizontal(const __m128i src) {
  __m128i s[3];
  Prepare3Lo8(src, s);
  return Sum3WLo16(s);
}

template <int offset>
inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_8<offset>(src, s);
  dst[0] = Sum3WLo16(s);
  dst[1] = Sum3WHi16(s);
}

inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum3WLo32(s);
  dst[1] = Sum3WHi32(s);
}

inline __m128i Sum5Horizontal(const __m128i src) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  return Sum5WLo16(s);
}

template <int offset>
inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
                           __m128i* const dst1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  *dst0 = Sum5WLo16(s);
  *dst1 = Sum5WHi16(s);
}

inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[5];
  Prepare5_16(src, s);
  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
  dst[0] = VaddwLo16(sum0123_lo, s[4]);
  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
  dst[1] = VaddwHi16(sum0123_hi, s[4]);
}

void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlLo16(src[0], src[4]);
  *row_sq3 = Sum3WLo32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}

void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlHi16(src[0], src[4]);
  *row_sq3 = Sum3WHi32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}

void SumHorizontalLo(const __m128i src, __m128i* const row3,
                     __m128i* const row5) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  const __m128i sum04 = VaddlLo8(s[0], s[4]);
  *row3 = Sum3WLo16(s + 1);
  *row5 = _mm_add_epi16(sum04, *row3);
}

template <int offset>
void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
                   __m128i* const row3_1, __m128i* const row5_0,
                   __m128i* const row5_1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
  *row3_0 = Sum3WLo16(s + 1);
  *row3_1 = Sum3WHi16(s + 1);
  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
}

inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
                          __m128i* const row_sq5_1) {
  __m128i s[5];
  Prepare5_16(src, s);
  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
}

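// Sum343* and Sum565* compute the weighted sums 3*a0 + 4*a1 + 3*a2 and
// 5*a0 + 6*a1 + 5*a2 (as 3*(a0+a1+a2) + a1 and 5*(a0+a1+a2) + a1), the 343
// and 565 weighting patterns used when accumulating |ma| and |b| across rows
// in the self-guided filter.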
inline __m128i Sum343Lo(const __m128i ma3[3]) {
  const __m128i sum = Sum3WLo16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwLo8(sum3, ma3[1]);
}

inline __m128i Sum343Hi(const __m128i ma3[3]) {
  const __m128i sum = Sum3WHi16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwHi8(sum3, ma3[1]);
}

inline __m128i Sum343WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwLo16(sum3, src[1]);
}

inline __m128i Sum343WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwHi16(sum3, src[1]);
}

inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum343WLo(s);
  dst[1] = Sum343WHi(s);
}

inline __m128i Sum565Lo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwLo8(sum5, src[1]);
}

inline __m128i Sum565Hi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwHi8(sum5, src[1]);
}

inline __m128i Sum565WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwLo16(sum5, src[1]);
}

inline __m128i Sum565WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwHi16(sum5, src[1]);
}

inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum565WLo(s);
  dst[1] = Sum565WHi(s);
}

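// Computes two rows of box sums: |sum3|/|sum5| receive the 3- and 5-wide
// horizontal pixel sums and |square_sum3|/|square_sum5| the matching sums of
// squared pixels. The templated overload below computes only one window size.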
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
                   uint32_t* square_sum3, uint32_t* square_sum5) {
  int y = 2;
  do {
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src,
                                 sum_width - x + kOverreadInBytesPass1 - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
      StoreAligned32U16(sum3, row3);
      StoreAligned32U16(sum5, row5);
      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 0, row_sq3);
      StoreAligned32U32(square_sum5 + 0, row_sq5);
      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 8, row_sq3);
      StoreAligned32U32(square_sum5 + 8, row_sq5);
      s[0] = s[1];
      sq[0] = sq[2];
      sum3 += 16;
      sum5 += 16;
      square_sum3 += 16;
      square_sum5 += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sum3 += sum_stride - sum_width;
    sum5 += sum_stride - sum_width;
    square_sum3 += sum_stride - sum_width;
    square_sum5 += sum_stride - sum_width;
  } while (--y != 0);
}

template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sums,
                   uint32_t* square_sums) {
  static_assert(size == 3 || size == 5, "");
  constexpr int kOverreadInBytes =
      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
  int y = 2;
  do {
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row[2], row_sq[4];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      if (size == 3) {
        Sum3Horizontal<0>(s, row);
        Sum3WHorizontal(sq + 0, row_sq + 0);
        Sum3WHorizontal(sq + 1, row_sq + 2);
      } else {
        Sum5Horizontal<0>(s, &row[0], &row[1]);
        Sum5WHorizontal(sq + 0, row_sq + 0);
        Sum5WHorizontal(sq + 1, row_sq + 2);
      }
      StoreAligned32U16(sums, row);
      StoreAligned64U32(square_sums, row_sq);
      s[0] = s[1];
      sq[0] = sq[2];
      sums += 16;
      square_sums += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sums += sum_stride - sum_width;
    square_sums += sum_stride - sum_width;
  } while (--y != 0);
}

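// Returns RightShiftWithRounding(max(n * sum_sq - sum * sum, 0) * scale,
// kSgrProjScaleBits); the CalculateSumAndIndex* helpers store the result as
// |index|.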
1138 template <int n>
CalculateMa(const __m128i sum,const __m128i sum_sq,const uint32_t scale)1139 inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
1140                            const uint32_t scale) {
1141   static_assert(n == 9 || n == 25, "");
1142   // a = |sum_sq|
1143   // d = |sum|
1144   // p = (a * n < d * d) ? 0 : a * n - d * d;
1145   const __m128i dxd = _mm_madd_epi16(sum, sum);
1146   // _mm_mullo_epi32() has high latency, so use shifts and additions instead.
1147   // Some compilers could do this for us, but we make it explicit.
1148   // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
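  // n * x is decomposed into shifts and adds: 9 * x == (x << 3) + x, and
  // 25 * x == (x << 4) + (x << 3) + x.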
1149   __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
1150   if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
1151   const __m128i sub = _mm_sub_epi32(axn, dxd);
1152   const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
1153   const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
1154   return VrshrU32(pxs, kSgrProjScaleBits);
1155 }
1156 
1157 template <int n>
1158 inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
1159                            const uint32_t scale) {
1160   static_assert(n == 9 || n == 25, "");
1161   const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
1162   const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
1163   const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
1164   const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
1165   return _mm_packus_epi32(z0, z1);
1166 }
1167 
1168 inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
1169   // one_over_n == 164.
1170   constexpr uint32_t one_over_n =
1171       ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
1172   // one_over_n_quarter == 41.
1173   constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
1174   static_assert(one_over_n == one_over_n_quarter << 2, "");
1175   // |ma| is in range [0, 255].
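  // Multiply by |one_over_n_quarter| (which fits in a byte) so the unsigned x
  // signed byte multiply in _mm_maddubs_epi16() cannot overflow int16_t:
  // 255 * 41 = 10455 fits, whereas 255 * 164 would not. The missing factor of
  // 4 is restored by shifting 2 bits less below.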
1176   const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
1177   const __m128i m0 = VmullLo16(m, sum);
1178   const __m128i m1 = VmullHi16(m, sum);
1179   const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
1180   const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
1181   return _mm_packus_epi32(b_lo, b_hi);
1182 }
1183 
1184 inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
1185   // one_over_n == 455.
1186   constexpr uint32_t one_over_n =
1187       ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
1188   const __m128i m0 = VmullLo16(ma, sum);
1189   const __m128i m1 = VmullHi16(ma, sum);
1190   const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
1191   const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
1192   const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
1193   const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
1194   return _mm_packus_epi32(b_lo, b_hi);
1195 }
1196 
1197 inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
1198                                   const uint32_t scale, __m128i* const sum,
1199                                   __m128i* const index) {
1200   __m128i sum_sq[2];
1201   *sum = Sum5_16(s5);
1202   Sum5_32(sq5, sum_sq);
1203   *index = CalculateMa<25>(*sum, sum_sq, scale);
1204 }
1205 
1206 inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
1207                                   const uint32_t scale, __m128i* const sum,
1208                                   __m128i* const index) {
1209   __m128i sum_sq[2];
1210   *sum = Sum3_16(s3);
1211   Sum3_32(sq3, sum_sq);
1212   *index = CalculateMa<9>(*sum, sum_sq, scale);
1213 }
1214 
1215 template <int n, int offset>
1216 inline void LookupIntermediate(const __m128i sum, const __m128i index,
1217                                __m128i* const ma, __m128i* const b) {
1218   static_assert(n == 9 || n == 25, "");
1219   static_assert(offset == 0 || offset == 8, "");
1220   const __m128i idx = _mm_packus_epi16(index, index);
1221   // The data is not actually stored and reloaded; the compiler keeps it in a
1222   // 64-bit general-purpose register, which is faster than _mm_extract_epi8().
1223   uint8_t temp[8];
1224   StoreLo8(temp, idx);
1225   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
1226   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
1227   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
1228   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
1229   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
1230   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
1231   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
1232   *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
1233   // b = ma * b * one_over_n
1234   // |ma| = [0, 255]
1235   // |sum| is a box sum with radius 1 or 2.
1236   // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1237   // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1238   // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1239   // When radius is 2 |n| is 25. |one_over_n| is 164.
1240   // When radius is 1 |n| is 9. |one_over_n| is 455.
1241   // |kSgrProjReciprocalBits| is 12.
1242   // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1243   // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1244   __m128i maq;
1245   if (offset == 0) {
1246     maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1247   } else {
1248     maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1249   }
1250   *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
1251 }
1252 
1253 // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
1254 // so that the shuffle result is 0. The most significant bit 1 comes either
1255 // from the comparison instruction or from the sign bit of the index.
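// For example, index 4 selects table[4], while index 20 makes the comparison
// set the mask's most significant bit, so the shuffle writes 0 for that lane.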
1256 inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
1257   __m128i mask;
1258   mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
1259   mask = _mm_or_si128(mask, index);
1260   return _mm_shuffle_epi8(table, mask);
1261 }
1262 
1263 inline __m128i AdjustValue(const __m128i value, const __m128i index,
1264                            const int threshold) {
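  // The caller biases |index| by -128 and |threshold| is biased the same way
  // here, so the signed comparison matches an unsigned compare of the original
  // values. The resulting 0xFF (-1) mask then decrements |value| by 1 wherever
  // the original index exceeds |threshold|.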
1265   const __m128i thresholds = _mm_set1_epi8(threshold - 128);
1266   const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
1267   return _mm_add_epi8(value, offset);
1268 }
1269 
1270 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1271                                   __m128i* const ma, __m128i* const b0,
1272                                   __m128i* const b1) {
1273   // Use table lookup to read elements whose indices are less than 48.
1274   const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
1275   const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
1276   const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
1277   const __m128i indices = _mm_packus_epi16(index[0], index[1]);
1278   __m128i idx;
1279   // Clip idx to 127 to apply signed comparison instructions.
1280   idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
1281   // Elements whose indices are 48 or larger remain 0 after the lookups below.
1282   // Get shuffle results for indices in range [0, 15].
1283   *ma = ShuffleIndex(c0, idx);
1284   // Get shuffle results for indices in range [16, 31].
1285   // Subtract 16 to utilize the sign bit of the index.
1286   idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1287   const __m128i res1 = ShuffleIndex(c1, idx);
1288   // Use OR instruction to combine shuffle results together.
1289   *ma = _mm_or_si128(*ma, res1);
1290   // Get shuffle results for indices in range [32, 47].
1291   // Subtract 16 to utilize the sign bit of the index.
1292   idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1293   const __m128i res2 = ShuffleIndex(c2, idx);
1294   *ma = _mm_or_si128(*ma, res2);
1295 
1296   // For elements whose indices are larger than 47, the lookup values change
1297   // only rarely as the index increases, so we use comparison and arithmetic
1298   // operations to calculate them instead.
1299   // Add -128 to apply signed comparison instructions.
1300   idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
1301   // Elements whose indices are larger than 47 (with value 0) are set to 5.
1302   *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
1303   *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
1304   *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
1305   *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
1306   *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
1307   *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
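  // For example, an original index of 100 is greater than 55 and 72 but not
  // greater than 101, so |ma| becomes 5 - 1 - 1 = 3, matching the table value
  // for indices in (72, 101].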
1308 
1309   // b = ma * b * one_over_n
1310   // |ma| = [0, 255]
1311   // |sum| is a box sum with radius 1 or 2.
1312   // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1313   // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1314   // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1315   // When radius is 2 |n| is 25. |one_over_n| is 164.
1316   // When radius is 1 |n| is 9. |one_over_n| is 455.
1317   // |kSgrProjReciprocalBits| is 12.
1318   // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1319   // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1320   const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1321   *b0 = CalculateB3(sum[0], maq0);
1322   const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1323   *b1 = CalculateB3(sum[1], maq1);
1324 }
1325 
1326 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1327                                   __m128i ma[2], __m128i b[2]) {
1328   __m128i mas;
1329   CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
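  // Place the first 8 new |ma| values in the high half of ma[0] and the last 8
  // in the low half of ma[1].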
1330   ma[0] = _mm_unpacklo_epi64(ma[0], mas);
1331   ma[1] = _mm_srli_si128(mas, 8);
1332 }
1333 
1334 // Note: Calling CalculateIntermediate() in place of the slow
1335 // LookupIntermediate() when calculating 16 intermediate data points was
1336 // tried, but the compiler generates even slower code.
1337 template <int offset>
1338 inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
1339                                    const uint32_t scale, __m128i* const ma,
1340                                    __m128i* const b) {
1341   static_assert(offset == 0 || offset == 8, "");
1342   __m128i sum, index;
1343   CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
1344   LookupIntermediate<25, offset>(sum, index, ma, b);
1345 }
1346 
1347 inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
1348                                    const uint32_t scale, __m128i* const ma,
1349                                    __m128i* const b) {
1350   __m128i sum, index;
1351   CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
1352   LookupIntermediate<9, 0>(sum, index, ma, b);
1353 }
1354 
1355 inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
1356                          __m128i sum_b343[2], __m128i sum_b444[2],
1357                          uint32_t* const b343, uint32_t* const b444) {
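  // |sum_b444| is 4 * (b[0] + b[1] + b[2]) and |sum_b343| is
  // 3 * b[0] + 4 * b[1] + 3 * b[2], the "444" and "343" weighted sums used by
  // the pass 2 filter.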
1358   __m128i b[3], sum_b111[2];
1359   Prepare3_16(b3, b);
1360   sum_b111[0] = Sum3WLo32(b);
1361   sum_b111[1] = Sum3WHi32(b);
1362   sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
1363   sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
1364   StoreAligned32U32(b444 + x, sum_b444);
1365   sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
1366   sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
1367   sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
1368   sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
1369   StoreAligned32U32(b343 + x, sum_b343);
1370 }
1371 
1372 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1373                            const ptrdiff_t x, __m128i* const sum_ma343,
1374                            __m128i* const sum_ma444, __m128i sum_b343[2],
1375                            __m128i sum_b444[2], uint16_t* const ma343,
1376                            uint16_t* const ma444, uint32_t* const b343,
1377                            uint32_t* const b444) {
1378   const __m128i sum_ma111 = Sum3WLo16(ma3);
1379   *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1380   StoreAligned16(ma444 + x, *sum_ma444);
1381   const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1382   *sum_ma343 = VaddwLo8(sum333, ma3[1]);
1383   StoreAligned16(ma343 + x, *sum_ma343);
1384   Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1385 }
1386 
1387 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1388                            const ptrdiff_t x, __m128i* const sum_ma343,
1389                            __m128i* const sum_ma444, __m128i sum_b343[2],
1390                            __m128i sum_b444[2], uint16_t* const ma343,
1391                            uint16_t* const ma444, uint32_t* const b343,
1392                            uint32_t* const b444) {
1393   const __m128i sum_ma111 = Sum3WHi16(ma3);
1394   *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1395   StoreAligned16(ma444 + x, *sum_ma444);
1396   const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1397   *sum_ma343 = VaddwHi8(sum333, ma3[1]);
1398   StoreAligned16(ma343 + x, *sum_ma343);
1399   Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1400 }
1401 
1402 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1403                            const ptrdiff_t x, __m128i* const sum_ma343,
1404                            __m128i sum_b343[2], uint16_t* const ma343,
1405                            uint16_t* const ma444, uint32_t* const b343,
1406                            uint32_t* const b444) {
1407   __m128i sum_ma444, sum_b444[2];
1408   Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1409                  ma444, b343, b444);
1410 }
1411 
1412 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1413                            const ptrdiff_t x, __m128i* const sum_ma343,
1414                            __m128i sum_b343[2], uint16_t* const ma343,
1415                            uint16_t* const ma444, uint32_t* const b343,
1416                            uint32_t* const b444) {
1417   __m128i sum_ma444, sum_b444[2];
1418   Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1419                  ma444, b343, b444);
1420 }
1421 
1422 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1423                            const ptrdiff_t x, uint16_t* const ma343,
1424                            uint16_t* const ma444, uint32_t* const b343,
1425                            uint32_t* const b444) {
1426   __m128i sum_ma343, sum_b343[2];
1427   Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1428 }
1429 
1430 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1431                            const ptrdiff_t x, uint16_t* const ma343,
1432                            uint16_t* const ma444, uint32_t* const b343,
1433                            uint32_t* const b444) {
1434   __m128i sum_ma343, sum_b343[2];
1435   Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1436 }
1437 
1438 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
1439     const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
1440     uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
1441     __m128i* const b) {
1442   __m128i s5[2][5], sq5[5][2];
1443   sq[0][1] = SquareHi8(s[0][0]);
1444   sq[1][1] = SquareHi8(s[1][0]);
1445   s5[0][3] = Sum5Horizontal(s[0][0]);
1446   StoreAligned16(sum5[3], s5[0][3]);
1447   s5[0][4] = Sum5Horizontal(s[1][0]);
1448   StoreAligned16(sum5[4], s5[0][4]);
1449   Sum5WHorizontal(sq[0], sq5[3]);
1450   StoreAligned32U32(square_sum5[3], sq5[3]);
1451   Sum5WHorizontal(sq[1], sq5[4]);
1452   StoreAligned32U32(square_sum5[4], sq5[4]);
1453   LoadAligned16x3U16(sum5, 0, s5[0]);
1454   LoadAligned32x3U32(square_sum5, 0, sq5);
1455   CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
1456 }
1457 
1458 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
1459     const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
1460     const uint32_t scale, uint16_t* const sum5[5],
1461     uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
1462     __m128i b[3]) {
1463   __m128i s5[2][5], sq5[5][2];
1464   sq[0][2] = SquareLo8(s[0][1]);
1465   sq[1][2] = SquareLo8(s[1][1]);
1466   Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
1467   StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1468   StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1469   Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
1470   StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1471   StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1472   Sum5WHorizontal(sq[0] + 1, sq5[3]);
1473   StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1474   Sum5WHorizontal(sq[1] + 1, sq5[4]);
1475   StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1476   LoadAligned16x3U16(sum5, x, s5[0]);
1477   LoadAligned32x3U32(square_sum5, x, sq5);
1478   CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
1479 
1480   sq[0][3] = SquareHi8(s[0][1]);
1481   sq[1][3] = SquareHi8(s[1][1]);
1482   Sum5WHorizontal(sq[0] + 2, sq5[3]);
1483   StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1484   Sum5WHorizontal(sq[1] + 2, sq5[4]);
1485   StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1486   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1487   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1488   CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
1489 }
1490 
1491 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
1492     const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
1493     const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
1494     __m128i* const b) {
1495   __m128i s5[5], sq5[5][2];
1496   sq[1] = SquareHi8(s);
1497   s5[3] = s5[4] = Sum5Horizontal(s);
1498   Sum5WHorizontal(sq, sq5[3]);
1499   sq5[4][0] = sq5[3][0];
1500   sq5[4][1] = sq5[3][1];
1501   LoadAligned16x3U16(sum5, 0, s5);
1502   LoadAligned32x3U32(square_sum5, 0, sq5);
1503   CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
1504 }
1505 
1506 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
1507     const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
1508     const uint32_t scale, const uint16_t* const sum5[5],
1509     const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
1510     __m128i b[3]) {
1511   __m128i s5[2][5], sq5[5][2];
1512   sq[2] = SquareLo8(s[1]);
1513   Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
1514   s5[0][4] = s5[0][3];
1515   s5[1][4] = s5[1][3];
1516   Sum5WHorizontal(sq + 1, sq5[3]);
1517   sq5[4][0] = sq5[3][0];
1518   sq5[4][1] = sq5[3][1];
1519   LoadAligned16x3U16(sum5, x, s5[0]);
1520   LoadAligned32x3U32(square_sum5, x, sq5);
1521   CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
1522 
1523   sq[3] = SquareHi8(s[1]);
1524   Sum5WHorizontal(sq + 2, sq5[3]);
1525   sq5[4][0] = sq5[3][0];
1526   sq5[4][1] = sq5[3][1];
1527   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1528   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1529   CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
1530 }
1531 
1532 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
1533     const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
1534     uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
1535     __m128i* const b) {
1536   __m128i s3[3], sq3[3][2];
1537   sq[1] = SquareHi8(s);
1538   s3[2] = Sum3Horizontal(s);
1539   StoreAligned16(sum3[2], s3[2]);
1540   Sum3WHorizontal(sq, sq3[2]);
1541   StoreAligned32U32(square_sum3[2], sq3[2]);
1542   LoadAligned16x2U16(sum3, 0, s3);
1543   LoadAligned32x2U32(square_sum3, 0, sq3);
1544   CalculateIntermediate3(s3, sq3, scale, ma, b);
1545 }
1546 
1547 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
1548     const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
1549     const uint32_t scale, uint16_t* const sum3[3],
1550     uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
1551     __m128i b[3]) {
1552   __m128i s3[4], sq3[3][2], sum[2], index[2];
1553   sq[2] = SquareLo8(s[1]);
1554   Sum3Horizontal<8>(s, s3 + 2);
1555   StoreAligned32U16(sum3[2] + x, s3 + 2);
1556   Sum3WHorizontal(sq + 1, sq3[2]);
1557   StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
1558   LoadAligned16x2U16(sum3, x, s3);
1559   LoadAligned32x2U32(square_sum3, x, sq3);
1560   CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
1561 
1562   sq[3] = SquareHi8(s[1]);
1563   Sum3WHorizontal(sq + 2, sq3[2]);
1564   StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1565   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
1566   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1567   CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
1568   CalculateIntermediate(sum, index, ma, b + 1);
1569 }
1570 
1571 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
1572     const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
1573     uint16_t* const sum5[5], uint32_t* const square_sum3[4],
1574     uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
1575     __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
1576   __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
1577   sq[0][1] = SquareHi8(s[0][0]);
1578   sq[1][1] = SquareHi8(s[1][0]);
1579   SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
1580   SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
1581   StoreAligned16(sum3[2], s3[2]);
1582   StoreAligned16(sum3[3], s3[3]);
1583   StoreAligned16(sum5[3], s5[3]);
1584   StoreAligned16(sum5[4], s5[4]);
1585   SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1586   StoreAligned32U32(square_sum3[2], sq3[2]);
1587   StoreAligned32U32(square_sum5[3], sq5[3]);
1588   SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1589   StoreAligned32U32(square_sum3[3], sq3[3]);
1590   StoreAligned32U32(square_sum5[4], sq5[4]);
1591   LoadAligned16x2U16(sum3, 0, s3);
1592   LoadAligned32x2U32(square_sum3, 0, sq3);
1593   LoadAligned16x3U16(sum5, 0, s5);
1594   LoadAligned32x3U32(square_sum5, 0, sq5);
1595   CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
1596   CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
1597   CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
1598   ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
1599   CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1600 }
1601 
1602 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
1603     const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
1604     uint16_t* const sum3[4], uint16_t* const sum5[5],
1605     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1606     const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
1607     __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
1608   __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
1609   SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1610   StoreAligned16(sum3[2] + x + 0, s3[0][2]);
1611   StoreAligned16(sum3[2] + x + 8, s3[1][2]);
1612   StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1613   StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1614   SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
1615   StoreAligned16(sum3[3] + x + 0, s3[0][3]);
1616   StoreAligned16(sum3[3] + x + 8, s3[1][3]);
1617   StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1618   StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1619   sq[0][2] = SquareLo8(s[0][1]);
1620   sq[1][2] = SquareLo8(s[1][1]);
1621   SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1622   StoreAligned32U32(square_sum3[2] + x, sq3[2]);
1623   StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1624   SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1625   StoreAligned32U32(square_sum3[3] + x, sq3[3]);
1626   StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1627   LoadAligned16x2U16(sum3, x, s3[0]);
1628   LoadAligned32x2U32(square_sum3, x, sq3);
1629   CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
1630   CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
1631                         &index[1][0]);
1632   LoadAligned16x3U16(sum5, x, s5[0]);
1633   LoadAligned32x3U32(square_sum5, x, sq5);
1634   CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
1635 
1636   sq[0][3] = SquareHi8(s[0][1]);
1637   sq[1][3] = SquareHi8(s[1][1]);
1638   SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1639   StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1640   StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1641   SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1642   StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
1643   StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1644   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1645   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1646   CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
1647   CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
1648                         &index[1][1]);
1649   CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
1650   CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
1651   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1652   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1653   CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
1654 }
1655 
1656 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
1657     const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
1658     const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1659     const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
1660     __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
1661   __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
1662   sq[1] = SquareHi8(s);
1663   SumHorizontalLo(s, &s3[2], &s5[3]);
1664   SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1665   LoadAligned16x3U16(sum5, 0, s5);
1666   s5[4] = s5[3];
1667   LoadAligned32x3U32(square_sum5, 0, sq5);
1668   sq5[4][0] = sq5[3][0];
1669   sq5[4][1] = sq5[3][1];
1670   CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1671   LoadAligned16x2U16(sum3, 0, s3);
1672   LoadAligned32x2U32(square_sum3, 0, sq3);
1673   CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
1674 }
1675 
1676 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
1677     const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
1678     const uint16_t scales[2], const uint16_t* const sum3[4],
1679     const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1680     const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
1681     __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
1682   __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
1683   sq[2] = SquareLo8(s[1]);
1684   SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1685   SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1686   LoadAligned16x3U16(sum5, x, s5[0]);
1687   s5[0][4] = s5[0][3];
1688   LoadAligned32x3U32(square_sum5, x, sq5);
1689   sq5[4][0] = sq5[3][0];
1690   sq5[4][1] = sq5[3][1];
1691   CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
1692   LoadAligned16x2U16(sum3, x, s3[0]);
1693   LoadAligned32x2U32(square_sum3, x, sq3);
1694   CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
1695 
1696   sq[3] = SquareHi8(s[1]);
1697   SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1698   LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1699   s5[1][4] = s5[1][3];
1700   LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1701   sq5[4][0] = sq5[3][0];
1702   sq5[4][1] = sq5[3][1];
1703   CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
1704   LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1705   LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1706   CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
1707   CalculateIntermediate(sum, index, ma3, b3 + 1);
1708 }
1709 
1710 inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
1711                                     const uint8_t* const src1, const int width,
1712                                     const uint32_t scale,
1713                                     uint16_t* const sum5[5],
1714                                     uint32_t* const square_sum5[5],
1715                                     const ptrdiff_t sum_width, uint16_t* ma565,
1716                                     uint32_t* b565) {
1717   __m128i s[2][2], mas[2], sq[2][4], bs[3];
1718   s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1719   s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1720   sq[0][0] = SquareLo8(s[0][0]);
1721   sq[1][0] = SquareLo8(s[1][0]);
1722   BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
1723 
1724   int x = 0;
1725   do {
1726     __m128i ma5[3], ma[2], b[4];
1727     s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1728                                   x + 16 + kOverreadInBytesPass1 - width);
1729     s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1730                                   x + 16 + kOverreadInBytesPass1 - width);
1731     BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1732                          bs);
1733     Prepare3_8<0>(mas, ma5);
1734     ma[0] = Sum565Lo(ma5);
1735     ma[1] = Sum565Hi(ma5);
1736     StoreAligned32U16(ma565, ma);
1737     Sum565W(bs + 0, b + 0);
1738     Sum565W(bs + 1, b + 2);
1739     StoreAligned64U32(b565, b);
1740     s[0][0] = s[0][1];
1741     s[1][0] = s[1][1];
1742     sq[0][1] = sq[0][3];
1743     sq[1][1] = sq[1][3];
1744     mas[0] = mas[1];
1745     bs[0] = bs[2];
1746     ma565 += 16;
1747     b565 += 16;
1748     x += 16;
1749   } while (x < width);
1750 }
1751 
1752 template <bool calculate444>
1753 LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
1754     const uint8_t* const src, const int width, const uint32_t scale,
1755     uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1756     const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
1757     uint32_t* b444) {
1758   __m128i s[2], mas[2], sq[4], bs[3];
1759   s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
1760   sq[0] = SquareLo8(s[0]);
1761   BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
1762 
1763   int x = 0;
1764   do {
1765     s[1] = LoadUnaligned16Msan(src + x + 16,
1766                                x + 16 + kOverreadInBytesPass2 - width);
1767     BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1768                          bs);
1769     __m128i ma3[3];
1770     Prepare3_8<0>(mas, ma3);
1771     if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
1772       Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
1773       Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
1774       ma444 += 16;
1775       b444 += 16;
1776     } else {
1777       __m128i ma[2], b[4];
1778       ma[0] = Sum343Lo(ma3);
1779       ma[1] = Sum343Hi(ma3);
1780       StoreAligned32U16(ma343, ma);
1781       Sum343W(bs + 0, b + 0);
1782       Sum343W(bs + 1, b + 2);
1783       StoreAligned64U32(b343, b);
1784     }
1785     s[0] = s[1];
1786     sq[1] = sq[3];
1787     mas[0] = mas[1];
1788     bs[0] = bs[2];
1789     ma343 += 16;
1790     b343 += 16;
1791     x += 16;
1792   } while (x < width);
1793 }
1794 
1795 inline void BoxSumFilterPreProcess(
1796     const uint8_t* const src0, const uint8_t* const src1, const int width,
1797     const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
1798     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1799     const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
1800     uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
1801     uint32_t* b565) {
1802   __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
1803   s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1804   s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1805   sq[0][0] = SquareLo8(s[0][0]);
1806   sq[1][0] = SquareLo8(s[1][0]);
1807   BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
1808                         ma3, b3, &ma5[0], &b5[0]);
1809 
1810   int x = 0;
1811   do {
1812     __m128i ma[2], b[4], ma3x[3], ma5x[3];
1813     s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1814                                   x + 16 + kOverreadInBytesPass1 - width);
1815     s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1816                                   x + 16 + kOverreadInBytesPass1 - width);
1817     BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
1818                         sum_width, sq, ma3, b3, ma5, b5);
1819 
1820     Prepare3_8<0>(ma3[0], ma3x);
1821     ma[0] = Sum343Lo(ma3x);
1822     ma[1] = Sum343Hi(ma3x);
1823     StoreAligned32U16(ma343[0] + x, ma);
1824     Sum343W(b3[0] + 0, b + 0);
1825     Sum343W(b3[0] + 1, b + 2);
1826     StoreAligned64U32(b343[0] + x, b);
1827     Sum565W(b5 + 0, b + 0);
1828     Sum565W(b5 + 1, b + 2);
1829     StoreAligned64U32(b565, b);
1830     Prepare3_8<0>(ma3[1], ma3x);
1831     Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
1832     Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
1833     Prepare3_8<0>(ma5, ma5x);
1834     ma[0] = Sum565Lo(ma5x);
1835     ma[1] = Sum565Hi(ma5x);
1836     StoreAligned32U16(ma565, ma);
1837     s[0][0] = s[0][1];
1838     s[1][0] = s[1][1];
1839     sq[0][1] = sq[0][3];
1840     sq[1][1] = sq[1][3];
1841     ma3[0][0] = ma3[0][1];
1842     ma3[1][0] = ma3[1][1];
1843     ma5[0] = ma5[1];
1844     b3[0][0] = b3[0][2];
1845     b3[1][0] = b3[1][2];
1846     b5[0] = b5[2];
1847     ma565 += 16;
1848     b565 += 16;
1849     x += 16;
1850   } while (x < width);
1851 }
1852 
1853 template <int shift>
1854 inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
1855   // ma: 255 * 32 = 8160 (13 bits)
1856   // b: 65088 * 32 = 2082816 (21 bits)
1857   // v: b - ma * 255 (22 bits)
1858   const __m128i v = _mm_sub_epi32(b, ma_x_src);
1859   // kSgrProjSgrBits = 8
1860   // kSgrProjRestoreBits = 4
1861   // shift = 4 or 5
1862   // v >> 8 or 9 (13 bits)
1863   return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
1864 }
1865 
1866 template <int shift>
1867 inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
1868                                        const __m128i b[2]) {
1869   const __m128i ma_x_src_lo = VmullLo16(ma, src);
1870   const __m128i ma_x_src_hi = VmullHi16(ma, src);
1871   const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
1872   const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
1873   return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
1874 }
1875 
1876 inline __m128i CalculateFilteredOutputPass1(const __m128i src,
1877                                             const __m128i ma[2],
1878                                             const __m128i b[2][2]) {
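  // The two |ma|/|b| terms are the 565 sums of consecutive row pairs, so the
  // total weight is doubled; shifting by 5 instead of 4 compensates.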
1879   const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
1880   __m128i b_sum[2];
1881   b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
1882   b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
1883   return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1884 }
1885 
1886 inline __m128i CalculateFilteredOutputPass2(const __m128i src,
1887                                             const __m128i ma[3],
1888                                             const __m128i b[3][2]) {
1889   const __m128i ma_sum = Sum3_16(ma);
1890   __m128i b_sum[2];
1891   Sum3_32(b, b_sum);
1892   return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1893 }
1894 
1895 inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
1896   const __m128i v_lo =
1897       VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1898   const __m128i v_hi =
1899       VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1900   const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
1901   return _mm_add_epi16(src, vv);
1902 }
1903 
1904 inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
1905                                           const __m128i filter[2], const int w0,
1906                                           const int w2) {
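  // Pack w0 and w2 into each 32-bit lane and interleave the two filter outputs
  // so that _mm_madd_epi16() computes w0 * filter[0] + w2 * filter[1] per
  // pixel in one instruction.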
1907   __m128i v[2];
1908   const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
1909   const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
1910   const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
1911   v[0] = _mm_madd_epi16(w0_w2, f_lo);
1912   v[1] = _mm_madd_epi16(w0_w2, f_hi);
1913   return SelfGuidedFinal(src, v);
1914 }
1915 
1916 inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
1917                                           const __m128i filter, const int w0) {
1918   // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
1919   __m128i v[2];
1920   v[0] = VmullNLo8(filter, w0);
1921   v[1] = VmullNHi8(filter, w0);
1922   return SelfGuidedFinal(src, v);
1923 }
1924 
1925 LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
1926     const uint8_t* const src, const uint8_t* const src0,
1927     const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
1928     uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
1929     const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
1930     uint32_t* const b565[2], uint8_t* const dst) {
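  // Each iteration filters 16 pixels of two rows (|src| and |src| + |stride|).
  // The first row combines the previous 565 sums in ma565[0]/b565[0] with the
  // new sums stored to ma565[1]/b565[1]; the second row uses the new sums only.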
1931   __m128i s[2][2], mas[2], sq[2][4], bs[3];
1932   s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1933   s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1934   sq[0][0] = SquareLo8(s[0][0]);
1935   sq[1][0] = SquareLo8(s[1][0]);
1936   BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
1937 
1938   int x = 0;
1939   do {
1940     __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
1941     s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1942                                   x + 16 + kOverreadInBytesPass1 - width);
1943     s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1944                                   x + 16 + kOverreadInBytesPass1 - width);
1945     BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1946                          bs);
1947     Prepare3_8<0>(mas, ma5);
1948     ma[1] = Sum565Lo(ma5);
1949     StoreAligned16(ma565[1] + x, ma[1]);
1950     Sum565W(bs, b[1]);
1951     StoreAligned32U32(b565[1] + x, b[1]);
1952     sr[0] = LoadAligned16(src + x);
1953     sr[1] = LoadAligned16(src + stride + x);
1954     const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
1955     const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
1956     ma[0] = LoadAligned16(ma565[0] + x);
1957     LoadAligned32U32(b565[0] + x, b[0]);
1958     p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
1959     p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
1960     const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
1961     const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
1962 
1963     ma[1] = Sum565Hi(ma5);
1964     StoreAligned16(ma565[1] + x + 8, ma[1]);
1965     Sum565W(bs + 1, b[1]);
1966     StoreAligned32U32(b565[1] + x + 8, b[1]);
1967     const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
1968     const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
1969     ma[0] = LoadAligned16(ma565[0] + x + 8);
1970     LoadAligned32U32(b565[0] + x + 8, b[0]);
1971     p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
1972     p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
1973     const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
1974     StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
1975     const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
1976     StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
1977     s[0][0] = s[0][1];
1978     s[1][0] = s[1][1];
1979     sq[0][1] = sq[0][3];
1980     sq[1][1] = sq[1][3];
1981     mas[0] = mas[1];
1982     bs[0] = bs[2];
1983     x += 16;
1984   } while (x < width);
1985 }
1986 
1987 inline void BoxFilterPass1LastRow(
1988     const uint8_t* const src, const uint8_t* const src0, const int width,
1989     const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1990     uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
1991     uint32_t* b565, uint8_t* const dst) {
1992   __m128i s[2], mas[2], sq[4], bs[3];
1993   s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1994   sq[0] = SquareLo8(s[0]);
1995   BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
1996                                 &bs[0]);
1997 
1998   int x = 0;
1999   do {
2000     __m128i ma[2], ma5[3], b[2][2];
2001     s[1] = LoadUnaligned16Msan(src0 + x + 16,
2002                                x + 16 + kOverreadInBytesPass1 - width);
2003     BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
2004                                 sq, mas, bs);
2005     Prepare3_8<0>(mas, ma5);
2006     ma[1] = Sum565Lo(ma5);
2007     Sum565W(bs, b[1]);
2008     ma[0] = LoadAligned16(ma565);
2009     LoadAligned32U32(b565, b[0]);
2010     const __m128i sr = LoadAligned16(src + x);
2011     const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
2012     __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
2013     const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
2014 
2015     ma[1] = Sum565Hi(ma5);
2016     Sum565W(bs + 1, b[1]);
2017     ma[0] = LoadAligned16(ma565 + 8);
2018     LoadAligned32U32(b565 + 8, b[0]);
2019     const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
2020     p = CalculateFilteredOutputPass1(sr_hi, ma, b);
2021     const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
2022     StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
2023     s[0] = s[1];
2024     sq[1] = sq[3];
2025     mas[0] = mas[1];
2026     bs[0] = bs[2];
2027     ma565 += 16;
2028     b565 += 16;
2029     x += 16;
2030   } while (x < width);
2031 }
2032 
2033 LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
2034     const uint8_t* const src, const uint8_t* const src0, const int width,
2035     const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
2036     uint16_t* const sum3[3], uint32_t* const square_sum3[3],
2037     uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
2038     uint32_t* const b444[2], uint8_t* const dst) {
2039   __m128i s[2], mas[2], sq[4], bs[3];
2040   s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
2041   sq[0] = SquareLo8(s[0]);
2042   BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
2043 
2044   int x = 0;
2045   do {
2046     s[1] = LoadUnaligned16Msan(src0 + x + 16,
2047                                x + 16 + kOverreadInBytesPass2 - width);
2048     BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
2049                          bs);
2050     __m128i ma[3], b[3][2], ma3[3];
2051     Prepare3_8<0>(mas, ma3);
2052     Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
2053                    b444[1]);
2054     const __m128i sr = LoadAligned16(src + x);
2055     const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
2056     ma[0] = LoadAligned16(ma343[0] + x);
2057     ma[1] = LoadAligned16(ma444[0] + x);
2058     LoadAligned32U32(b343[0] + x, b[0]);
2059     LoadAligned32U32(b444[0] + x, b[1]);
2060     const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
2061 
2062     Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
2063                    b343[2], b444[1]);
2064     const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
2065     ma[0] = LoadAligned16(ma343[0] + x + 8);
2066     ma[1] = LoadAligned16(ma444[0] + x + 8);
2067     LoadAligned32U32(b343[0] + x + 8, b[0]);
2068     LoadAligned32U32(b444[0] + x + 8, b[1]);
2069     const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
2070     const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
2071     const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
2072     StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
2073     s[0] = s[1];
2074     sq[1] = sq[3];
2075     mas[0] = mas[1];
2076     bs[0] = bs[2];
2077     x += 16;
2078   } while (x < width);
2079 }
2080 
2081 LIBGAV1_ALWAYS_INLINE void BoxFilter(
2082     const uint8_t* const src, const uint8_t* const src0,
2083     const uint8_t* const src1, const ptrdiff_t stride, const int width,
2084     const uint16_t scales[2], const int16_t w0, const int16_t w2,
2085     uint16_t* const sum3[4], uint16_t* const sum5[5],
2086     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2087     const ptrdiff_t sum_width, uint16_t* const ma343[4],
2088     uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
2089     uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
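  // Combined filter: each iteration computes the radius-2 (pass 1) and
  // radius-1 (pass 2) filter outputs for 16 pixels of two rows and blends them
  // with the weights w0 and w2.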
2090   __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
2091   s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
2092   s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
2093   sq[0][0] = SquareLo8(s[0][0]);
2094   sq[1][0] = SquareLo8(s[1][0]);
2095   BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
2096                         ma3, b3, &ma5[0], &b5[0]);
2097 
2098   int x = 0;
2099   do {
2100     __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
2101     s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
2102                                   x + 16 + kOverreadInBytesPass1 - width);
2103     s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
2104                                   x + 16 + kOverreadInBytesPass1 - width);
2105     BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
2106                         sum_width, sq, ma3, b3, ma5, b5);
2107     Prepare3_8<0>(ma3[0], ma3x[0]);
2108     Prepare3_8<0>(ma3[1], ma3x[1]);
2109     Prepare3_8<0>(ma5, ma5x);
2110     Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
2111                    ma343[2], ma444[1], b343[2], b444[1]);
2112     Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
2113                    b343[3], b444[2]);
2114     ma[0][1] = Sum565Lo(ma5x);
2115     StoreAligned16(ma565[1] + x, ma[0][1]);
2116     Sum565W(b5, b[0][1]);
2117     StoreAligned32U32(b565[1] + x, b[0][1]);
2118     const __m128i sr0 = LoadAligned16(src + x);
2119     const __m128i sr1 = LoadAligned16(src + stride + x);
2120     const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
2121     const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
2122     ma[0][0] = LoadAligned16(ma565[0] + x);
2123     LoadAligned32U32(b565[0] + x, b[0][0]);
2124     p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
2125     p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
2126     ma[1][0] = LoadAligned16(ma343[0] + x);
2127     ma[1][1] = LoadAligned16(ma444[0] + x);
2128     LoadAligned32U32(b343[0] + x, b[1][0]);
2129     LoadAligned32U32(b444[0] + x, b[1][1]);
2130     p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
2131     const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
2132     ma[2][0] = LoadAligned16(ma343[1] + x);
2133     LoadAligned32U32(b343[1] + x, b[2][0]);
2134     p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
2135     const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
2136 
    Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
    Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
                   ma444[2], b343[3], b444[2]);
    ma[0][1] = Sum565Hi(ma5x);
    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
    Sum565W(b5 + 1, b[0][1]);
    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
    const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
    const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
    s[0][0] = s[0][1];
    s[1][0] = s[1][1];
    sq[0][1] = sq[0][3];
    sq[1][1] = sq[1][3];
    ma3[0][0] = ma3[0][1];
    ma3[1][0] = ma3[1][1];
    ma5[0] = ma5[1];
    b3[0][0] = b3[0][2];
    b3[1][0] = b3[1][2];
    b5[0] = b5[2];
    x += 16;
  } while (x < width);
}

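// Filters the last row of the restoration unit. Called from BoxFilterProcess()
// when |height| is odd; |src0| points at the second bottom-border row.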
inline void BoxFilterLastRow(
    const uint8_t* const src, const uint8_t* const src0, const int width,
    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
    uint8_t* const dst) {
  __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
  sq[0] = SquareLo8(s[0]);
  BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
                               square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
                               &b5[0]);

  int x = 0;
  do {
    __m128i ma3x[3], ma5x[3], p[2];
    s[1] = LoadUnaligned16Msan(src0 + x + 16,
                               x + 16 + kOverreadInBytesPass1 - width);
    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
    Prepare3_8<0>(ma3, ma3x);
    Prepare3_8<0>(ma5, ma5x);
    ma[1] = Sum565Lo(ma5x);
    Sum565W(b5, b[1]);
    ma[2] = Sum343Lo(ma3x);
    Sum343W(b3, b[2]);
    const __m128i sr = LoadAligned16(src + x);
    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565 + x);
    LoadAligned32U32(b565 + x, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
    ma[0] = LoadAligned16(ma343 + x);
    ma[1] = LoadAligned16(ma444 + x);
    LoadAligned32U32(b343 + x, b[0]);
    LoadAligned32U32(b444 + x, b[1]);
    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);

    ma[1] = Sum565Hi(ma5x);
    Sum565W(b5 + 1, b[1]);
    ma[2] = Sum343Hi(ma3x);
    Sum343W(b3 + 1, b[2]);
    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
    ma[0] = LoadAligned16(ma565 + x + 8);
    LoadAligned32U32(b565 + x + 8, b[0]);
    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
    ma[0] = LoadAligned16(ma343 + x + 8);
    ma[1] = LoadAligned16(ma444 + x + 8);
    LoadAligned32U32(b343 + x + 8, b[0]);
    LoadAligned32U32(b444 + x + 8, b[1]);
    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
    s[0] = s[1];
    sq[1] = sq[3];
    ma3[0] = ma3[1];
    ma5[0] = ma5[1];
    b3[0] = b3[2];
    b5[0] = b5[2];
    x += 16;
  } while (x < width);
}

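// Combined path used when both SGR passes are enabled: pass 1 (5x5 box sums,
// Sum565) and pass 2 (3x3 box sums, Sum343/Store343_444) are applied together
// and blended with the weights |w0| and |w2|. Two rows are filtered per loop
// iteration.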
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
    const RestorationUnitInfo& restoration_info, const uint8_t* src,
    const ptrdiff_t stride, const uint8_t* const top_border,
    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    SgrBuffer* const sgr_buffer, uint8_t* dst) {
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 3; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  b444[0] = sgr_buffer->b444;
  for (int i = 1; i <= 2; ++i) {
    ma444[i] = ma444[i - 1] + temp_stride;
    b444[i] = b444[i - 1] + temp_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scales[0] != 0);
  assert(scales[1] != 0);
  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
         sum5[1], square_sum3[0], square_sum5[1]);
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
                         b343, b444[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

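  // Each iteration filters two rows. Afterwards the sliding-window sums and
  // the intermediate ma*/b* buffers are rotated so that the entries computed
  // for the bottom rows become the top entries of the next iteration.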
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate4PointersBy2<uint16_t>(sum3);
    Circulate4PointersBy2<uint32_t>(square_sum3);
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
              ma343, ma444, ma565, b343, b444, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    Circulate4PointersBy2<uint16_t>(ma343);
    Circulate4PointersBy2<uint32_t>(b343);
    std::swap(ma444[0], ma444[2]);
    std::swap(b444[0], b444[2]);
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

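  // Filter the remaining one or two rows, taking the rows below the unit from
  // |bottom_border|.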
  Circulate4PointersBy2<uint16_t>(sum3);
  Circulate4PointersBy2<uint32_t>(square_sum3);
  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint8_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
              b444, b565, dst);
  }
  if ((height & 1) != 0) {
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      Circulate4PointersBy2<uint16_t>(sum3);
      Circulate4PointersBy2<uint32_t>(square_sum3);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
      Circulate4PointersBy2<uint16_t>(ma343);
      Circulate4PointersBy2<uint32_t>(b343);
      std::swap(ma444[0], ma444[2]);
      std::swap(b444[0], b444[2]);
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
    }
    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
                     b444[0], b565[0], dst);
  }
}

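// Pass-1-only path, used when the pass 2 radius is 0: only the 5x5 box filter
// is applied, weighted by |w0|.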
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  uint16_t *sum5[5], *ma565[2];
  uint32_t *square_sum5[5], *b565[2];
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scale != 0);
  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum5[1], square_sum5[1]);
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
                          ma565[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint8_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
                   sum_width, scale, w0, ma565, b565, dst);
  }
  if ((height & 1) != 0) {
    src += 3;
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
    }
    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
                          b565[0], dst);
  }
}

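// Pass-2-only path, used when the pass 1 radius is 0 (multiplier[0] is 0):
// only the 3x3 box filter is applied.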
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
  uint16_t *sum3[3], *ma343[3], *ma444[2];
  uint32_t *square_sum3[3], *b343[3], *b444[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 2; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  ma444[1] = ma444[0] + temp_stride;
  b444[0] = sgr_buffer->b444;
  b444[1] = b444[0] + temp_stride;
  assert(scale != 0);
  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum3[0], square_sum3[0]);
  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
                                 sum_width, ma343[0], nullptr, b343[0],
                                 nullptr);
  Circulate3PointersBy1<uint16_t>(sum3);
  Circulate3PointersBy1<uint32_t>(square_sum3);
  const uint8_t* s;
  if (height > 1) {
    s = src + stride;
  } else {
    s = bottom_border;
    bottom_border += bottom_border_stride;
  }
  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
                                ma343[1], ma444[0], b343[1], b444[0]);

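  // Pass 2 filters one row per iteration, rotating the three rows of 3x3 sums
  // and the ma343/ma444 (and b343/b444) buffers as the window slides down.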
  for (int y = height - 2; y > 0; --y) {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  }

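  // Filter the last one or two rows, reading the row below the unit from
  // |bottom_border|.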
  int y = std::min(height, 2);
  src += 2;
  do {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    bottom_border += bottom_border_stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  } while (--y != 0);
}

// If |width| is not a multiple of 16, up to 15 extra pixels are written to
// |dest| at the end of each row. It is safe to overwrite the output as it will
// not be part of the visible frame.
void SelfGuidedFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int index = restoration_info.sgr_proj_info.index;
  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
  const auto* const src = static_cast<const uint8_t*>(source);
  const auto* top = static_cast<const uint8_t*>(top_border);
  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
  auto* const dst = static_cast<uint8_t*>(dest);
  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
  if (radius_pass_1 == 0) {
    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
    // following assertion.
    assert(radius_pass_0 != 0);
    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
                          top_border_stride, bottom - 3, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else if (radius_pass_0 == 0) {
    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
                          top_border_stride, bottom - 2, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else {
    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
                     top_border_stride, bottom - 3, bottom_border_stride, width,
                     height, sgr_buffer, dst);
  }
}

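// Registers the SSE4.1 loop restoration functions in the 8bpp DSP table when
// they are enabled by the DSP_ENABLED_8BPP_SSE4_1 checks.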
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#else
  static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#else
  static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}

}  // namespace
}  // namespace low_bitdepth

void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else   // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void LoopRestorationInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1