1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 #include <smmintrin.h>
20 
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25 
26 #include "src/dsp/common.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/film_grain_common.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/logging.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace film_grain {
38 namespace {
39 
40 // Load 8 values from source, widening to int16_t intermediate value size.
41 // The function is overloaded for each type and bitdepth for simplicity.
LoadSource(const int8_t * src)42 inline __m128i LoadSource(const int8_t* src) {
43   return _mm_cvtepi8_epi16(LoadLo8(src));
44 }
45 
46 // Load 8 values from source, widening to int16_t intermediate value size.
LoadSource(const uint8_t * src)47 inline __m128i LoadSource(const uint8_t* src) {
48   return _mm_cvtepu8_epi16(LoadLo8(src));
49 }
50 
LoadSourceMsan(const uint8_t * src,const int valid_range)51 inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
52   return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
53 }
54 
55 // Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
StoreUnsigned(uint8_t * dest,const __m128i data)56 inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
57   StoreLo8(dest, _mm_packus_epi16(data, data));
58 }
59 
60 #if LIBGAV1_MAX_BITDEPTH >= 10
61 // Load 8 values from source.
LoadSource(const int16_t * src)62 inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
63 
64 // Load 8 values from source.
LoadSource(const uint16_t * src)65 inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
66 
67 // Store 8 values to dest.
StoreUnsigned(uint16_t * dest,const __m128i data)68 inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
69   StoreUnaligned16(dest, data);
70 }
71 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
72 
73 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint8_t * const luma,int subsampling_x)74 inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
75   if (subsampling_x != 0) {
76     const __m128i src = LoadUnaligned16(luma);
77 
78     return RightShiftWithRounding_U16(
79         _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
80                        _mm_unpackhi_epi8(src, _mm_setzero_si128())),
81         1);
82   }
83   return _mm_cvtepu8_epi16(LoadLo8(luma));
84 }
85 
GetAverageLumaMsan(const uint8_t * const luma,int subsampling_x,int valid_range)86 inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
87                                   int valid_range) {
88   if (subsampling_x != 0) {
89     const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
90 
91     return RightShiftWithRounding_U16(
92         _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
93                        _mm_unpackhi_epi8(src, _mm_setzero_si128())),
94         1);
95   }
96   return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
97 }
98 
99 #if LIBGAV1_MAX_BITDEPTH >= 10
100 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint16_t * const luma,int subsampling_x)101 inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
102   if (subsampling_x != 0) {
103     return RightShiftWithRounding_U16(
104         _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
105   }
106   return LoadUnaligned16(luma);
107 }
108 
GetAverageLumaMsan(const uint16_t * const luma,int subsampling_x,int valid_range)109 inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
110                                   int valid_range) {
111   if (subsampling_x != 0) {
112     return RightShiftWithRounding_U16(
113         _mm_hadd_epi16(
114             LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
115             LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
116         1);
117   }
118   return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
119 }
120 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
121 
// Clamps every signed 16-bit lane of |value| to the range [low, high].
// The ceiling is applied first and the floor second, so in a lane where
// |low| exceeds |high| the result is |low|.
inline __m128i Clip3(const __m128i value, const __m128i low,
                     const __m128i high) {
  return _mm_max_epi16(low, _mm_min_epi16(high, value));
}
127 
128 template <int bitdepth, typename Pixel>
GetScalingFactors(const int16_t * scaling_lut,const Pixel * source)129 inline __m128i GetScalingFactors(const int16_t* scaling_lut,
130                                  const Pixel* source) {
131   alignas(16) int16_t start_vals[8];
132   static_assert(bitdepth <= kBitdepth10,
133                 "SSE4 Film Grain is not yet implemented for 12bpp.");
134   for (int i = 0; i < 8; ++i) {
135     assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
136     start_vals[i] = scaling_lut[source[i]];
137   }
138   return LoadAligned16(start_vals);
139 }
140 
141 // |scaling_shift| is in range [8,11].
142 template <int bitdepth>
ScaleNoise(const __m128i noise,const __m128i scaling,const __m128i scaling_shift)143 inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
144                           const __m128i scaling_shift) {
145   const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
146   return _mm_mulhrs_epi16(noise, shifted_scale_factors);
147 }
148 
149 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageLuma_SSE4_1(const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_luma,int scaling_shift,int width,int height,int start_height,const int16_t * scaling_lut_y,const void * source_plane_y,ptrdiff_t source_stride_y,void * dest_plane_y,ptrdiff_t dest_stride_y)150 void BlendNoiseWithImageLuma_SSE4_1(
151     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
152     int scaling_shift, int width, int height, int start_height,
153     const int16_t* scaling_lut_y, const void* source_plane_y,
154     ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
155   const auto* noise_image =
156       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
157   const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
158   source_stride_y /= sizeof(Pixel);
159   auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
160   dest_stride_y /= sizeof(Pixel);
161   const __m128i floor = _mm_set1_epi16(min_value);
162   const __m128i ceiling = _mm_set1_epi16(max_luma);
163   const int safe_width = width & ~7;
164   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
165   int y = 0;
166   do {
167     int x = 0;
168     for (; x < safe_width; x += 8) {
169       const __m128i orig = LoadSource(&in_y_row[x]);
170       const __m128i scaling =
171           GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
172       __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
173 
174       noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
175       const __m128i combined = _mm_add_epi16(orig, noise);
176       StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
177     }
178 
179     if (x < width) {
180       Pixel luma_buffer[8];
181       // Prevent arbitrary indices from entering GetScalingFactors.
182       memset(luma_buffer, 0, sizeof(luma_buffer));
183       const int valid_range = width - x;
184       memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
185       luma_buffer[valid_range] = in_y_row[width - 1];
186       const __m128i orig = LoadSource(&in_y_row[x]);
187       const __m128i scaling =
188           GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
189       __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
190 
191       noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
192       const __m128i combined = _mm_add_epi16(orig, noise);
193       StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
194     }
195     in_y_row += source_stride_y;
196     out_y_row += dest_stride_y;
197   } while (++y < height);
198   out_y_row = static_cast<Pixel*>(dest_plane_y);
199 }
200 
201 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaValsWithCfl(const Pixel * LIBGAV1_RESTRICT average_luma_buffer,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT chroma_cursor,const GrainType * LIBGAV1_RESTRICT noise_image_cursor,const __m128i scaling_shift)202 inline __m128i BlendChromaValsWithCfl(
203     const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
204     const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
205     const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
206     const __m128i scaling_shift) {
207   const __m128i scaling =
208       GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
209   const __m128i orig = LoadSource(chroma_cursor);
210   __m128i noise = LoadSource(noise_image_cursor);
211   noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
212   return _mm_add_epi16(orig, noise);
213 }
214 
215 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaPlaneWithCfl_SSE4_1(const Array2D<GrainType> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const Pixel * in_chroma_row,ptrdiff_t source_stride_chroma,Pixel * out_chroma_row,ptrdiff_t dest_stride)216 LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
217     const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
218     int width, int height, int start_height, int subsampling_x,
219     int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
220     const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
221     const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
222     Pixel* out_chroma_row, ptrdiff_t dest_stride) {
223   const __m128i floor = _mm_set1_epi16(min_value);
224   const __m128i ceiling = _mm_set1_epi16(max_chroma);
225   alignas(16) Pixel luma_buffer[16];
226 
227   const int chroma_height = (height + subsampling_y) >> subsampling_y;
228   const int chroma_width = (width + subsampling_x) >> subsampling_x;
229   // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
230   // need to be guarded from overread, even if |chroma_width| is divisible by 8.
231   const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
232 
233   // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
234   // in GetScalingFactors.
235   Pixel average_luma_buffer[8];
236   assert(start_height % 2 == 0);
237   start_height >>= subsampling_y;
238   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
239   int y = 0;
240   do {
241     int x = 0;
242     for (; x < safe_chroma_width; x += 8) {
243       const int luma_x = x << subsampling_x;
244       const __m128i average_luma =
245           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
246       StoreUnsigned(average_luma_buffer, average_luma);
247 
248       const __m128i blended =
249           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
250               average_luma_buffer, scaling_lut, &in_chroma_row[x],
251               &(noise_image[y + start_height][x]), derived_scaling_shift);
252       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
253     }
254 
255     // This section only runs if width % (8 << sub_x) != 0. It should never run
256     // on 720p and above.
257     if (x < chroma_width) {
258       // Prevent huge indices from entering GetScalingFactors due to
259       // uninitialized values. This is not a problem in 8bpp because the table
260       // is made larger than 255 values.
261       if (bitdepth > kBitdepth8) {
262         memset(luma_buffer, 0, sizeof(luma_buffer));
263       }
264       const int luma_x = x << subsampling_x;
265       const int valid_range = width - luma_x;
266       assert(valid_range < 16);
267       memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
268       luma_buffer[valid_range] = in_y_row[width - 1];
269       const __m128i average_luma =
270           GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
271       StoreUnsigned(average_luma_buffer, average_luma);
272 
273       const __m128i blended =
274           BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
275               average_luma_buffer, scaling_lut, &in_chroma_row[x],
276               &(noise_image[y + start_height][x]), derived_scaling_shift);
277       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
278     }
279 
280     in_y_row += source_stride_y << subsampling_y;
281     in_chroma_row += source_stride_chroma;
282     out_chroma_row += dest_stride;
283   } while (++y < chroma_height);
284 }
285 
286 // This function is for the case params_.chroma_scaling_from_luma == true.
287 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
288 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageChromaWithCfl_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)289 void BlendNoiseWithImageChromaWithCfl_SSE4_1(
290     Plane plane, const FilmGrainParams& params,
291     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
292     int width, int height, int start_height, int subsampling_x,
293     int subsampling_y, const int16_t* scaling_lut,
294     const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
295     const void* source_plane_uv, ptrdiff_t source_stride_uv,
296     void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
297   const auto* noise_image =
298       static_cast<const Array2D<GrainType>*>(noise_image_ptr);
299   const auto* in_y = static_cast<const Pixel*>(source_plane_y);
300   source_stride_y /= sizeof(Pixel);
301 
302   const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
303   source_stride_uv /= sizeof(Pixel);
304   auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
305   dest_stride_uv /= sizeof(Pixel);
306   BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
307       noise_image[plane], min_value, max_chroma, width, height, start_height,
308       subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
309       source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
310 }
311 
312 }  // namespace
313 
314 namespace low_bitdepth {
315 namespace {
316 
317 // |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
BlendChromaValsNoCfl8bpp(const int16_t * scaling_lut,const __m128i & orig,const int8_t * LIBGAV1_RESTRICT noise_image_cursor,const __m128i & average_luma,const __m128i & scaling_shift,const __m128i & offset,const __m128i & weights)318 inline __m128i BlendChromaValsNoCfl8bpp(
319     const int16_t* scaling_lut, const __m128i& orig,
320     const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
321     const __m128i& average_luma, const __m128i& scaling_shift,
322     const __m128i& offset, const __m128i& weights) {
323   uint8_t merged_buffer[8];
324   const __m128i combined_lo =
325       _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
326   const __m128i combined_hi =
327       _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
328   const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
329                                               _mm_srai_epi32((combined_hi), 6));
330 
331   const __m128i merged = _mm_add_epi16(merged_base, offset);
332 
333   StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
334   const __m128i scaling =
335       GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
336   __m128i noise = LoadSource(noise_image_cursor);
337   noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
338   return _mm_add_epi16(orig, noise);
339 }
340 
BlendChromaPlane8bpp_SSE4_1(const Array2D<int8_t> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,int chroma_offset,int chroma_multiplier,int luma_multiplier,const int16_t * scaling_lut,const uint8_t * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const uint8_t * in_chroma_row,ptrdiff_t source_stride_chroma,uint8_t * out_chroma_row,ptrdiff_t dest_stride)341 LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
342     const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
343     int width, int height, int start_height, int subsampling_x,
344     int subsampling_y, int scaling_shift, int chroma_offset,
345     int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
346     const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
347     const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
348     uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
349   const __m128i floor = _mm_set1_epi16(min_value);
350   const __m128i ceiling = _mm_set1_epi16(max_chroma);
351 
352   const int chroma_height = (height + subsampling_y) >> subsampling_y;
353   const int chroma_width = (width + subsampling_x) >> subsampling_x;
354   // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
355   // will need to be guarded from overread, even if |chroma_width| is a
356   // multiple of 8.
357   const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
358   alignas(16) uint8_t luma_buffer[16];
359   const __m128i offset = _mm_set1_epi16(chroma_offset);
360   const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
361                                              (luma_multiplier & 0xFFFF));
362   const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
363 
364   start_height >>= subsampling_y;
365   int y = 0;
366   do {
367     int x = 0;
368     for (; x < safe_chroma_width; x += 8) {
369       const int luma_x = x << subsampling_x;
370       const __m128i average_luma =
371           GetAverageLuma(&in_y_row[luma_x], subsampling_x);
372       const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
373       const __m128i blended = BlendChromaValsNoCfl8bpp(
374           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
375           average_luma, derived_scaling_shift, offset, multipliers);
376       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
377     }
378 
379     if (x < chroma_width) {
380       // Begin right edge iteration. Same as the normal iterations, but the
381       // |average_luma| computation requires a duplicated luma value at the
382       // end.
383       const int luma_x = x << subsampling_x;
384       const int valid_range = width - luma_x;
385       assert(valid_range < 16);
386       // There is no need to pre-initialize this buffer, because merged values
387       // used as indices are saturated in the 8bpp case. Uninitialized values
388       // are written outside the frame.
389       memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
390       luma_buffer[valid_range] = in_y_row[width - 1];
391       const int valid_range_chroma = chroma_width - x;
392       uint8_t chroma_buffer[8];
393       memcpy(chroma_buffer, &in_chroma_row[x],
394              valid_range_chroma * sizeof(in_chroma_row[0]));
395 
396       const __m128i average_luma =
397           GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
398       const __m128i orig_chroma =
399           LoadSourceMsan(chroma_buffer, valid_range_chroma);
400       const __m128i blended = BlendChromaValsNoCfl8bpp(
401           scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
402           average_luma, derived_scaling_shift, offset, multipliers);
403       StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
404       // End of right edge iteration.
405     }
406 
407     in_y_row += source_stride_y << subsampling_y;
408     in_chroma_row += source_stride_chroma;
409     out_chroma_row += dest_stride;
410   } while (++y < chroma_height);
411 }
412 
413 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma8bpp_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)414 void BlendNoiseWithImageChroma8bpp_SSE4_1(
415     Plane plane, const FilmGrainParams& params,
416     const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
417     int width, int height, int start_height, int subsampling_x,
418     int subsampling_y, const int16_t* scaling_lut,
419     const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
420     const void* source_plane_uv, ptrdiff_t source_stride_uv,
421     void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
422   assert(plane == kPlaneU || plane == kPlaneV);
423   const auto* noise_image =
424       static_cast<const Array2D<int8_t>*>(noise_image_ptr);
425   const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
426   const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
427   auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
428 
429   const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
430   const int luma_multiplier =
431       (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
432   const int multiplier =
433       (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
434   BlendChromaPlane8bpp_SSE4_1(
435       noise_image[plane], min_value, max_chroma, width, height, start_height,
436       subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
437       luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
438       source_stride_uv, out_uv, dest_stride_uv);
439 }
440 
Init8bpp()441 void Init8bpp() {
442   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
443   assert(dsp != nullptr);
444 
445   dsp->film_grain.blend_noise_luma =
446       BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
447   dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
448   dsp->film_grain.blend_noise_chroma[1] =
449       BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
450 }
451 
452 }  // namespace
453 }  // namespace low_bitdepth
454 
455 #if LIBGAV1_MAX_BITDEPTH >= 10
456 namespace high_bitdepth {
457 namespace {
458 
Init10bpp()459 void Init10bpp() {
460   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
461   assert(dsp != nullptr);
462 
463   dsp->film_grain.blend_noise_luma =
464       BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
465   dsp->film_grain.blend_noise_chroma[1] =
466       BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
467 }
468 
469 }  // namespace
470 }  // namespace high_bitdepth
471 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
472 
473 }  // namespace film_grain
474 
FilmGrainInit_SSE4_1()475 void FilmGrainInit_SSE4_1() {
476   film_grain::low_bitdepth::Init8bpp();
477 #if LIBGAV1_MAX_BITDEPTH >= 10
478   film_grain::high_bitdepth::Init10bpp();
479 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
480 }
481 
482 }  // namespace dsp
483 }  // namespace libgav1
484 
#else   // !LIBGAV1_TARGETING_SSE4_1
486 
487 namespace libgav1 {
488 namespace dsp {
489 
void FilmGrainInit_SSE4_1() {
  // Intentionally empty: this definition is the one compiled when
  // LIBGAV1_TARGETING_SSE4_1 is false, so there is nothing to register.
}
491 
492 }  // namespace dsp
493 }  // namespace libgav1
494 #endif  // LIBGAV1_TARGETING_SSE4_1
495