1 // Copyright 2021 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred_filter.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <xmmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27 
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/dsp/x86/transpose_sse4.h"
32 #include "src/utils/common.h"
33 #include "src/utils/constants.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace {
38 
39 //------------------------------------------------------------------------------
40 // FilterIntraPredictor_SSE4_1
41 // Section 7.11.2.3. Recursive intra prediction process
42 // This filter applies recursively to 4x2 sub-blocks within the transform block,
43 // meaning that the predicted pixels in each sub-block are used as inputs to
44 // sub-blocks below and to the right, if present.
45 //
46 // Each output value in the sub-block is predicted by a different filter applied
47 // to the same array of top-left, top, and left values. If fn refers to the
48 // output of the nth filter, given this block:
49 // TL T0 T1 T2 T3
50 // L0 f0 f1 f2 f3
51 // L1 f4 f5 f6 f7
52 // The filter input order is p0, p1, p2, p3, p4, p5, p6:
53 // p0 p1 p2 p3 p4
54 // p5 f0 f1 f2 f3
55 // p6 f4 f5 f6 f7
56 // Filters usually apply to 8 values for convenience, so in this case we fix
57 // the 8th filter tap to 0 and disregard the value of the 8th input.
58 
// Shuffle-control immediate for _mm_shuffle_epi32 that selects the 32-bit
// lanes in the order 0, 1, 0, 1 (0x44 == 0b01'00'01'00), which duplicates the
// first 8 bytes of a 128-bit vector into the second 8 bytes. Used to apply two
// filter kernels to the same set of input pixels in one multiply.
constexpr int kDuplicateFirstHalf = 0x44;
62 
63 // Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
64 // at zero to preserve the sum.
65 // |pixels| contains p0-p7 in order as shown above.
66 // |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
                             const ptrdiff_t stride, const __m128i& pixels,
                             const __m128i& taps_0_1, const __m128i& taps_2_3,
                             const __m128i& taps_4_5, const __m128i& taps_6_7) {
  // Multiply each unsigned 8-bit pixel by its signed 8-bit tap and add
  // horizontally adjacent pairs, producing 16-bit partial sums. |pixels| holds
  // the same 8 inputs in both halves (see kDuplicateFirstHalf), so each madd
  // covers two filter kernels.
  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
  // |output_half| contains 8 partial sums for f0-f7.
  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
  __m128i output = _mm_hadd_epi16(output_half, output_half);
  // Scale back by 16 with rounding (taps appear to be in 1/16 units, given
  // the shift of 4) and clamp to [0, 255] via the unsigned saturating pack.
  const __m128i output_row0 =
      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
                       /* unused half */ output);
  Store4(dst, output_row0);
  // Second row of the 4x2 sub-block: same inputs, kernels f4-f7.
  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
  output = _mm_hadd_epi16(output_half, output_half);
  const __m128i output_row1 =
      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
                       /* arbitrary pack arg */ output);
  Store4(dst + stride, output_row1);
}
89 
90 // 4xH transform sizes are given special treatment because LoadLo8 goes out
91 // of bounds and every block involves the left column. The top-left pixel, p0,
92 // is stored in the top buffer for the first 4x2, but comes from the left buffer
93 // for successive blocks. This implementation takes advantage of the fact
94 // that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
95 // using shifts to arrange things to fit reusable shuffle vectors.
// Predicts a 4-wide column of |height| rows (branches exist for heights 4, 8,
// and 16) into |dest|, two rows per Filter4x2_SSE4_1 call. |top_ptr| points at
// the row above the block and |left_ptr| at the column to its left; later 4x2
// sub-blocks read their "top" inputs back out of previously written |dest|
// rows, per the recursive filter definition.
inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
                      const uint8_t* LIBGAV1_RESTRICT const top_ptr,
                      const uint8_t* LIBGAV1_RESTRICT const left_ptr,
                      FilterIntraPredictor pred, const int height) {
  // Two filter kernels per vector.
  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
  // top_ptr - 1 is the top-left pixel; top_ptr[3] completes p0..p4.
  __m128i top = Load4(top_ptr - 1);
  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
  // Only 4 left pixels exist for height 4; a full LoadLo8 would read out of
  // bounds (see the comment block above this function).
  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
  left = _mm_slli_si128(left, 5);

  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
  // left[2], left[3], left[4], left[5], left[6], left[7]
  // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
  //                   r0
  pixels = _mm_or_si128(left, pixels);

  // Two sets of the same input pixels to apply two filters at once.
  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                   taps_6_7);
  dest += stride;  // Move to y = 1.
  // The row just written at y = 1 supplies p1..p4 for the next 4x2.
  pixels = Load4(dest);

  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
  // left[0], left[1], ...
  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
  //                         r0
  pixels = _mm_or_si128(left, pixels);

  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
  // byte is an unused value, which shall be multiplied by 0 when we apply the
  // filter.
  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;

  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  dest += stride;  // Move to y = 2.
  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                   taps_6_7);
  dest += stride;  // Move to y = 3.

  // Compute the middle 8 rows before using common code for the final 4 rows, in
  // order to fit the assumption that |left| has the next TL at position 8.
  if (height == 16) {
    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
    left = _mm_slli_si128(left, 1);
    pixels = Load4(dest);

    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
    //                                  r0
    pixels = _mm_or_si128(left, pixels);

    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
    // last byte is an unused value, as above. The top-left was shifted to
    // position nine to keep two empty spaces after the top pixels.
    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;

    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
    // the end.
    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
    dest += stride;  // Move to y = 4.

    // First 4x2 in the if body.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);

    // Clear all but final pixel in the first 8 of left column.
    __m128i keep_top_left = _mm_srli_si128(left, 13);
    dest += stride;  // Move to y = 5.
    pixels = Load4(dest);
    left = _mm_srli_si128(left, 2);

    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
    //                                  r0
    pixels = _mm_or_si128(left, pixels);
    // Fetch the second half of the left column for the remaining rows.
    left = LoadLo8(left_ptr + 8);

    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
    dest += stride;  // Move to y = 6.

    // Second 4x2 in the if body.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);

    // Position TL value so we can use pixel_order1.
    keep_top_left = _mm_slli_si128(keep_top_left, 6);
    dest += stride;  // Move to y = 7.
    pixels = Load4(dest);
    left = _mm_slli_si128(left, 7);
    left = _mm_or_si128(left, keep_top_left);

    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
    // left[-1], left[0], left[1], left[2], left[3], ...
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
    //                         r0
    pixels = _mm_or_si128(left, pixels);
    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
    dest += stride;  // Move to y = 8.

    // Third 4x2 in the if body.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);
    dest += stride;  // Move to y = 9.

    // Prepare final inputs.
    pixels = Load4(dest);
    left = _mm_srli_si128(left, 2);

    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
    // left[-1], left[0], left[1], left[2], left[3], ...
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
    //                         r0
    pixels = _mm_or_si128(left, pixels);
    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
    dest += stride;  // Move to y = 10.

    // Fourth 4x2 in the if body.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);
    dest += stride;  // Move to y = 11.
  }

  // In both the 8 and 16 case at this point, we can assume that |left| has the
  // next TL at position 8.
  if (height > 4) {
    // Erase prior left pixels by shifting TL to position 0.
    left = _mm_srli_si128(left, 8);
    left = _mm_slli_si128(left, 6);
    pixels = Load4(dest);

    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
    // left[-1], left[0], left[1], left[2], left[3], ...
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
    //                         r0
    pixels = _mm_or_si128(left, pixels);
    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
    dest += stride;  // Move to y = 12 or 4.

    // First of final two 4x2 blocks.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);
    dest += stride;  // Move to y = 13 or 5.
    pixels = Load4(dest);
    left = _mm_srli_si128(left, 2);

    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
    // left[-1], left[0], left[1], left[2], left[3], ...
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
    //                         r0
    pixels = _mm_or_si128(left, pixels);
    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
    dest += stride;  // Move to y = 14 or 6.

    // Last of final two 4x2 blocks.
    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);
  }
}
273 
// Entry point for the recursive filter intra predictor. Fills the width x
// height block at |dest| from |top_row| and |left_column|, processing 4x4
// tiles (two 4x2 filter applications each) left-to-right, then top-to-bottom.
// Rows below y = 1 and columns right of x = 3 read their filter inputs back
// out of previously predicted |dest| pixels rather than the border buffers.
void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                                 ptrdiff_t stride,
                                 const void* LIBGAV1_RESTRICT const top_row,
                                 const void* LIBGAV1_RESTRICT const left_column,
                                 FilterIntraPredictor pred, const int width,
                                 const int height) {
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  // 4-wide blocks need out-of-bounds-safe loads; see Filter4xH.
  if (width == 4) {
    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
    return;
  }

  // There is one set of 7 taps for each of the 4x2 output pixels.
  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);

  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
  // the end is an unused value, which shall be multiplied by 0 when we apply
  // the filter.
  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;

  // Takes the "left section" and puts it right after p0-p4.
  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);

  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
  // byte is unused as above.
  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;

  // Shuffles the "top left" from the left section, to the front. Used when
  // grabbing data from left_column and not top_row.
  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);

  // This first pass takes care of the cases where the top left pixel comes from
  // top_row.
  __m128i pixels = LoadLo8(top_ptr - 1);
  __m128i left = _mm_slli_si128(Load4(left_column), 8);
  pixels = _mm_or_si128(pixels, left);

  // Two sets of the same pixels to multiply with two sets of taps.
  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
  left = _mm_srli_si128(left, 1);

  // Load the 4 pixels just predicted at y = 1; they are the top row for the
  // 4x2 at y = 2.
  pixels = Load4(dst + stride);

  // Because of the above shift, this OR 'invades' the final of the first 8
  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
  // a padded 0.
  pixels = _mm_or_si128(pixels, left);
  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
  const ptrdiff_t stride2 = stride << 1;
  const ptrdiff_t stride4 = stride << 2;
  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                   taps_6_7);
  dst += 4;
  // Remaining 4x4 tiles of the top row strip; x starts at 3 because the loads
  // below begin one pixel left of the tile (the top-left input).
  for (int x = 3; x < width - 4; x += 4) {
    pixels = Load4(top_ptr + x);
    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
    // p5 and p6 come from the already-predicted columns to the left.
    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);

    // Duplicate bottom half into upper half.
    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);
    pixels = Load4(dst + stride - 1);
    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);

    // Duplicate bottom half into upper half.
    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
                     taps_4_5, taps_6_7);
    dst += 4;
  }

  // Now we handle heights that reference previous blocks rather than top_row.
  for (int y = 4; y < height; y += 4) {
    // Leftmost 4x4 block for this height.
    dst -= width;
    dst += stride4;

    // Top Left is not available by offset in these leftmost blocks.
    pixels = Load4(dst - stride);
    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
    pixels = _mm_or_si128(pixels, left);
    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                     taps_6_7);

    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
    left = _mm_srli_si128(left, 2);
    pixels = Load4(dst + stride);
    pixels = _mm_or_si128(pixels, left);
    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
                     taps_4_5, taps_6_7);

    dst += 4;

    // Remaining 4x4 blocks for this height.
    for (int x = 4; x < width; x += 4) {
      // All seven inputs come from previously predicted pixels here.
      pixels = Load4(dst - stride - 1);
      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);

      // Duplicate bottom half into upper half.
      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
                       taps_6_7);
      pixels = Load4(dst + stride - 1);
      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);

      // Duplicate bottom half into upper half.
      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
                       taps_4_5, taps_6_7);
      dst += 4;
    }
  }
}
405 
Init8bpp()406 void Init8bpp() {
407   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
408   assert(dsp != nullptr);
409   static_cast<void>(dsp);
410 // These guards check if this version of the function was not superseded by
411 // a higher optimization level, such as AVX. The corresponding #define also
412 // prevents the C version from being added to the table.
413 #if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
414   dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
415 #endif
416 }
417 
418 }  // namespace
419 
IntraPredFilterInit_SSE4_1()420 void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
421 
422 }  // namespace dsp
423 }  // namespace libgav1
424 
425 #else   // !LIBGAV1_TARGETING_SSE4_1
426 namespace libgav1 {
427 namespace dsp {
428 
// No-op stub used when the build does not target SSE4.1; the C (or other
// SIMD) implementation remains in the dsp table.
void IntraPredFilterInit_SSE4_1() {}
430 
431 }  // namespace dsp
432 }  // namespace libgav1
433 #endif  // LIBGAV1_TARGETING_SSE4_1
434