1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/film_grain.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19 #include <smmintrin.h>
20
21 #include <cassert>
22 #include <cstddef>
23 #include <cstdint>
24 #include <cstring>
25
26 #include "src/dsp/common.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/film_grain_common.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/logging.h"
34
35 namespace libgav1 {
36 namespace dsp {
37 namespace film_grain {
38 namespace {
39
40 // Load 8 values from source, widening to int16_t intermediate value size.
41 // The function is overloaded for each type and bitdepth for simplicity.
LoadSource(const int8_t * src)42 inline __m128i LoadSource(const int8_t* src) {
43 return _mm_cvtepi8_epi16(LoadLo8(src));
44 }
45
46 // Load 8 values from source, widening to int16_t intermediate value size.
LoadSource(const uint8_t * src)47 inline __m128i LoadSource(const uint8_t* src) {
48 return _mm_cvtepu8_epi16(LoadLo8(src));
49 }
50
LoadSourceMsan(const uint8_t * src,const int valid_range)51 inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
52 return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
53 }
54
55 // Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
StoreUnsigned(uint8_t * dest,const __m128i data)56 inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
57 StoreLo8(dest, _mm_packus_epi16(data, data));
58 }
59
60 #if LIBGAV1_MAX_BITDEPTH >= 10
61 // Load 8 values from source.
LoadSource(const int16_t * src)62 inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
63
64 // Load 8 values from source.
LoadSource(const uint16_t * src)65 inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
66
67 // Store 8 values to dest.
StoreUnsigned(uint16_t * dest,const __m128i data)68 inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
69 StoreUnaligned16(dest, data);
70 }
71 #endif // LIBGAV1_MAX_BITDEPTH >= 10
72
73 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint8_t * const luma,int subsampling_x)74 inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
75 if (subsampling_x != 0) {
76 const __m128i src = LoadUnaligned16(luma);
77
78 return RightShiftWithRounding_U16(
79 _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
80 _mm_unpackhi_epi8(src, _mm_setzero_si128())),
81 1);
82 }
83 return _mm_cvtepu8_epi16(LoadLo8(luma));
84 }
85
GetAverageLumaMsan(const uint8_t * const luma,int subsampling_x,int valid_range)86 inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
87 int valid_range) {
88 if (subsampling_x != 0) {
89 const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
90
91 return RightShiftWithRounding_U16(
92 _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
93 _mm_unpackhi_epi8(src, _mm_setzero_si128())),
94 1);
95 }
96 return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
97 }
98
99 #if LIBGAV1_MAX_BITDEPTH >= 10
100 // For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
GetAverageLuma(const uint16_t * const luma,int subsampling_x)101 inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
102 if (subsampling_x != 0) {
103 return RightShiftWithRounding_U16(
104 _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
105 }
106 return LoadUnaligned16(luma);
107 }
108
GetAverageLumaMsan(const uint16_t * const luma,int subsampling_x,int valid_range)109 inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
110 int valid_range) {
111 if (subsampling_x != 0) {
112 return RightShiftWithRounding_U16(
113 _mm_hadd_epi16(
114 LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
115 LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
116 1);
117 }
118 return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
119 }
120 #endif // LIBGAV1_MAX_BITDEPTH >= 10
121
// Clamps each signed 16-bit lane of |value| to [low, high]. The upper bound is
// applied first and the lower bound last, so if a lane has low > high the
// result for that lane is |low|.
inline __m128i Clip3(const __m128i value, const __m128i low,
                     const __m128i high) {
  return _mm_max_epi16(low, _mm_min_epi16(high, value));
}
127
128 template <int bitdepth, typename Pixel>
GetScalingFactors(const int16_t * scaling_lut,const Pixel * source)129 inline __m128i GetScalingFactors(const int16_t* scaling_lut,
130 const Pixel* source) {
131 alignas(16) int16_t start_vals[8];
132 static_assert(bitdepth <= kBitdepth10,
133 "SSE4 Film Grain is not yet implemented for 12bpp.");
134 for (int i = 0; i < 8; ++i) {
135 assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
136 start_vals[i] = scaling_lut[source[i]];
137 }
138 return LoadAligned16(start_vals);
139 }
140
// Multiplies each 16-bit |noise| lane by the matching |scaling| lane and
// shifts the product back down with rounding.
// The callers in this file pass (15 - shift) in the low bits of
// |scaling_shift|, where the film grain shift is in range [8,11]. Pre-shifting
// the scale factor left by that amount lets _mm_mulhrs_epi16, which keeps the
// rounded high part of the product (a shift by 15), do the multiply and the
// rounded right shift in one instruction.
template <int bitdepth>
inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
                          const __m128i scaling_shift) {
  const __m128i pre_shifted_scaling = _mm_sll_epi16(scaling, scaling_shift);
  const __m128i scaled_noise = _mm_mulhrs_epi16(noise, pre_shifted_scaling);
  return scaled_noise;
}
148
149 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageLuma_SSE4_1(const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_luma,int scaling_shift,int width,int height,int start_height,const int16_t * scaling_lut_y,const void * source_plane_y,ptrdiff_t source_stride_y,void * dest_plane_y,ptrdiff_t dest_stride_y)150 void BlendNoiseWithImageLuma_SSE4_1(
151 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
152 int scaling_shift, int width, int height, int start_height,
153 const int16_t* scaling_lut_y, const void* source_plane_y,
154 ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
155 const auto* noise_image =
156 static_cast<const Array2D<GrainType>*>(noise_image_ptr);
157 const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
158 source_stride_y /= sizeof(Pixel);
159 auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
160 dest_stride_y /= sizeof(Pixel);
161 const __m128i floor = _mm_set1_epi16(min_value);
162 const __m128i ceiling = _mm_set1_epi16(max_luma);
163 const int safe_width = width & ~7;
164 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
165 int y = 0;
166 do {
167 int x = 0;
168 for (; x < safe_width; x += 8) {
169 const __m128i orig = LoadSource(&in_y_row[x]);
170 const __m128i scaling =
171 GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
172 __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
173
174 noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
175 const __m128i combined = _mm_add_epi16(orig, noise);
176 StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
177 }
178
179 if (x < width) {
180 Pixel luma_buffer[8];
181 // Prevent arbitrary indices from entering GetScalingFactors.
182 memset(luma_buffer, 0, sizeof(luma_buffer));
183 const int valid_range = width - x;
184 memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
185 luma_buffer[valid_range] = in_y_row[width - 1];
186 const __m128i orig = LoadSource(&in_y_row[x]);
187 const __m128i scaling =
188 GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
189 __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
190
191 noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
192 const __m128i combined = _mm_add_epi16(orig, noise);
193 StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
194 }
195 in_y_row += source_stride_y;
196 out_y_row += dest_stride_y;
197 } while (++y < height);
198 out_y_row = static_cast<Pixel*>(dest_plane_y);
199 }
200
201 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaValsWithCfl(const Pixel * LIBGAV1_RESTRICT average_luma_buffer,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT chroma_cursor,const GrainType * LIBGAV1_RESTRICT noise_image_cursor,const __m128i scaling_shift)202 inline __m128i BlendChromaValsWithCfl(
203 const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
204 const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
205 const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
206 const __m128i scaling_shift) {
207 const __m128i scaling =
208 GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
209 const __m128i orig = LoadSource(chroma_cursor);
210 __m128i noise = LoadSource(noise_image_cursor);
211 noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
212 return _mm_add_epi16(orig, noise);
213 }
214
215 template <int bitdepth, typename GrainType, typename Pixel>
BlendChromaPlaneWithCfl_SSE4_1(const Array2D<GrainType> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,const int16_t * scaling_lut,const Pixel * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const Pixel * in_chroma_row,ptrdiff_t source_stride_chroma,Pixel * out_chroma_row,ptrdiff_t dest_stride)216 LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
217 const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
218 int width, int height, int start_height, int subsampling_x,
219 int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
220 const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
221 const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
222 Pixel* out_chroma_row, ptrdiff_t dest_stride) {
223 const __m128i floor = _mm_set1_epi16(min_value);
224 const __m128i ceiling = _mm_set1_epi16(max_chroma);
225 alignas(16) Pixel luma_buffer[16];
226
227 const int chroma_height = (height + subsampling_y) >> subsampling_y;
228 const int chroma_width = (width + subsampling_x) >> subsampling_x;
229 // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
230 // need to be guarded from overread, even if |chroma_width| is divisible by 8.
231 const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
232
233 // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
234 // in GetScalingFactors.
235 Pixel average_luma_buffer[8];
236 assert(start_height % 2 == 0);
237 start_height >>= subsampling_y;
238 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
239 int y = 0;
240 do {
241 int x = 0;
242 for (; x < safe_chroma_width; x += 8) {
243 const int luma_x = x << subsampling_x;
244 const __m128i average_luma =
245 GetAverageLuma(&in_y_row[luma_x], subsampling_x);
246 StoreUnsigned(average_luma_buffer, average_luma);
247
248 const __m128i blended =
249 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
250 average_luma_buffer, scaling_lut, &in_chroma_row[x],
251 &(noise_image[y + start_height][x]), derived_scaling_shift);
252 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
253 }
254
255 // This section only runs if width % (8 << sub_x) != 0. It should never run
256 // on 720p and above.
257 if (x < chroma_width) {
258 // Prevent huge indices from entering GetScalingFactors due to
259 // uninitialized values. This is not a problem in 8bpp because the table
260 // is made larger than 255 values.
261 if (bitdepth > kBitdepth8) {
262 memset(luma_buffer, 0, sizeof(luma_buffer));
263 }
264 const int luma_x = x << subsampling_x;
265 const int valid_range = width - luma_x;
266 assert(valid_range < 16);
267 memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
268 luma_buffer[valid_range] = in_y_row[width - 1];
269 const __m128i average_luma =
270 GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
271 StoreUnsigned(average_luma_buffer, average_luma);
272
273 const __m128i blended =
274 BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
275 average_luma_buffer, scaling_lut, &in_chroma_row[x],
276 &(noise_image[y + start_height][x]), derived_scaling_shift);
277 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
278 }
279
280 in_y_row += source_stride_y << subsampling_y;
281 in_chroma_row += source_stride_chroma;
282 out_chroma_row += dest_stride;
283 } while (++y < chroma_height);
284 }
285
286 // This function is for the case params_.chroma_scaling_from_luma == true.
287 // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
288 template <int bitdepth, typename GrainType, typename Pixel>
BlendNoiseWithImageChromaWithCfl_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)289 void BlendNoiseWithImageChromaWithCfl_SSE4_1(
290 Plane plane, const FilmGrainParams& params,
291 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
292 int width, int height, int start_height, int subsampling_x,
293 int subsampling_y, const int16_t* scaling_lut,
294 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
295 const void* source_plane_uv, ptrdiff_t source_stride_uv,
296 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
297 const auto* noise_image =
298 static_cast<const Array2D<GrainType>*>(noise_image_ptr);
299 const auto* in_y = static_cast<const Pixel*>(source_plane_y);
300 source_stride_y /= sizeof(Pixel);
301
302 const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
303 source_stride_uv /= sizeof(Pixel);
304 auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
305 dest_stride_uv /= sizeof(Pixel);
306 BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
307 noise_image[plane], min_value, max_chroma, width, height, start_height,
308 subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
309 source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
310 }
311
312 } // namespace
313
314 namespace low_bitdepth {
315 namespace {
316
317 // |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
BlendChromaValsNoCfl8bpp(const int16_t * scaling_lut,const __m128i & orig,const int8_t * LIBGAV1_RESTRICT noise_image_cursor,const __m128i & average_luma,const __m128i & scaling_shift,const __m128i & offset,const __m128i & weights)318 inline __m128i BlendChromaValsNoCfl8bpp(
319 const int16_t* scaling_lut, const __m128i& orig,
320 const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
321 const __m128i& average_luma, const __m128i& scaling_shift,
322 const __m128i& offset, const __m128i& weights) {
323 uint8_t merged_buffer[8];
324 const __m128i combined_lo =
325 _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
326 const __m128i combined_hi =
327 _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
328 const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
329 _mm_srai_epi32((combined_hi), 6));
330
331 const __m128i merged = _mm_add_epi16(merged_base, offset);
332
333 StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
334 const __m128i scaling =
335 GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
336 __m128i noise = LoadSource(noise_image_cursor);
337 noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
338 return _mm_add_epi16(orig, noise);
339 }
340
BlendChromaPlane8bpp_SSE4_1(const Array2D<int8_t> & noise_image,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,int scaling_shift,int chroma_offset,int chroma_multiplier,int luma_multiplier,const int16_t * scaling_lut,const uint8_t * LIBGAV1_RESTRICT in_y_row,ptrdiff_t source_stride_y,const uint8_t * in_chroma_row,ptrdiff_t source_stride_chroma,uint8_t * out_chroma_row,ptrdiff_t dest_stride)341 LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
342 const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
343 int width, int height, int start_height, int subsampling_x,
344 int subsampling_y, int scaling_shift, int chroma_offset,
345 int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
346 const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
347 const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
348 uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
349 const __m128i floor = _mm_set1_epi16(min_value);
350 const __m128i ceiling = _mm_set1_epi16(max_chroma);
351
352 const int chroma_height = (height + subsampling_y) >> subsampling_y;
353 const int chroma_width = (width + subsampling_x) >> subsampling_x;
354 // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
355 // will need to be guarded from overread, even if |chroma_width| is a
356 // multiple of 8.
357 const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
358 alignas(16) uint8_t luma_buffer[16];
359 const __m128i offset = _mm_set1_epi16(chroma_offset);
360 const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
361 (luma_multiplier & 0xFFFF));
362 const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
363
364 start_height >>= subsampling_y;
365 int y = 0;
366 do {
367 int x = 0;
368 for (; x < safe_chroma_width; x += 8) {
369 const int luma_x = x << subsampling_x;
370 const __m128i average_luma =
371 GetAverageLuma(&in_y_row[luma_x], subsampling_x);
372 const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
373 const __m128i blended = BlendChromaValsNoCfl8bpp(
374 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
375 average_luma, derived_scaling_shift, offset, multipliers);
376 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
377 }
378
379 if (x < chroma_width) {
380 // Begin right edge iteration. Same as the normal iterations, but the
381 // |average_luma| computation requires a duplicated luma value at the
382 // end.
383 const int luma_x = x << subsampling_x;
384 const int valid_range = width - luma_x;
385 assert(valid_range < 16);
386 // There is no need to pre-initialize this buffer, because merged values
387 // used as indices are saturated in the 8bpp case. Uninitialized values
388 // are written outside the frame.
389 memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
390 luma_buffer[valid_range] = in_y_row[width - 1];
391 const int valid_range_chroma = chroma_width - x;
392 uint8_t chroma_buffer[8];
393 memcpy(chroma_buffer, &in_chroma_row[x],
394 valid_range_chroma * sizeof(in_chroma_row[0]));
395
396 const __m128i average_luma =
397 GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
398 const __m128i orig_chroma =
399 LoadSourceMsan(chroma_buffer, valid_range_chroma);
400 const __m128i blended = BlendChromaValsNoCfl8bpp(
401 scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
402 average_luma, derived_scaling_shift, offset, multipliers);
403 StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
404 // End of right edge iteration.
405 }
406
407 in_y_row += source_stride_y << subsampling_y;
408 in_chroma_row += source_stride_chroma;
409 out_chroma_row += dest_stride;
410 } while (++y < chroma_height);
411 }
412
413 // This function is for the case params_.chroma_scaling_from_luma == false.
BlendNoiseWithImageChroma8bpp_SSE4_1(Plane plane,const FilmGrainParams & params,const void * LIBGAV1_RESTRICT noise_image_ptr,int min_value,int max_chroma,int width,int height,int start_height,int subsampling_x,int subsampling_y,const int16_t * scaling_lut,const void * LIBGAV1_RESTRICT source_plane_y,ptrdiff_t source_stride_y,const void * source_plane_uv,ptrdiff_t source_stride_uv,void * dest_plane_uv,ptrdiff_t dest_stride_uv)414 void BlendNoiseWithImageChroma8bpp_SSE4_1(
415 Plane plane, const FilmGrainParams& params,
416 const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
417 int width, int height, int start_height, int subsampling_x,
418 int subsampling_y, const int16_t* scaling_lut,
419 const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
420 const void* source_plane_uv, ptrdiff_t source_stride_uv,
421 void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
422 assert(plane == kPlaneU || plane == kPlaneV);
423 const auto* noise_image =
424 static_cast<const Array2D<int8_t>*>(noise_image_ptr);
425 const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
426 const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
427 auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
428
429 const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
430 const int luma_multiplier =
431 (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
432 const int multiplier =
433 (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
434 BlendChromaPlane8bpp_SSE4_1(
435 noise_image[plane], min_value, max_chroma, width, height, start_height,
436 subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
437 luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
438 source_stride_uv, out_uv, dest_stride_uv);
439 }
440
Init8bpp()441 void Init8bpp() {
442 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
443 assert(dsp != nullptr);
444
445 dsp->film_grain.blend_noise_luma =
446 BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
447 dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
448 dsp->film_grain.blend_noise_chroma[1] =
449 BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
450 }
451
452 } // namespace
453 } // namespace low_bitdepth
454
455 #if LIBGAV1_MAX_BITDEPTH >= 10
456 namespace high_bitdepth {
457 namespace {
458
Init10bpp()459 void Init10bpp() {
460 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
461 assert(dsp != nullptr);
462
463 dsp->film_grain.blend_noise_luma =
464 BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
465 dsp->film_grain.blend_noise_chroma[1] =
466 BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
467 }
468
469 } // namespace
470 } // namespace high_bitdepth
471 #endif // LIBGAV1_MAX_BITDEPTH >= 10
472
473 } // namespace film_grain
474
FilmGrainInit_SSE4_1()475 void FilmGrainInit_SSE4_1() {
476 film_grain::low_bitdepth::Init8bpp();
477 #if LIBGAV1_MAX_BITDEPTH >= 10
478 film_grain::high_bitdepth::Init10bpp();
479 #endif // LIBGAV1_MAX_BITDEPTH >= 10
480 }
481
482 } // namespace dsp
483 } // namespace libgav1
484
#else  // !LIBGAV1_TARGETING_SSE4_1
486
487 namespace libgav1 {
488 namespace dsp {
489
// Fallback for builds that do not target SSE4.1: nothing is registered.
void FilmGrainInit_SSE4_1() {
  // Intentionally empty.
}
491
492 } // namespace dsp
493 } // namespace libgav1
494 #endif // LIBGAV1_TARGETING_SSE4_1
495