1 /*****************************************************************************
2
3 TransLut.cpp
4 Author: Laurent de Soras, 2015
5
6 To do:
7 - Remove code for destination bitdepth < 16
8
9 --- Legal stuff ---
10
11 This program is free software. It comes without any warranty, to
12 the extent permitted by applicable law. You can redistribute it
13 and/or modify it under the terms of the Do What The Fuck You Want
14 To Public License, Version 2, as published by Sam Hocevar. See
15 http://sam.zoy.org/wtfpl/COPYING for more details.
16
17 *Tab=3***********************************************************************/
18
19
20
21 #if defined (_MSC_VER)
22 #pragma warning (1 : 4130 4223 4705 4706)
23 #pragma warning (4 : 4355 4786 4800)
24 #endif
25
26
27
28 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
29
30 #include "fstb/def.h"
31
32 #include "fmtcl/Cst.h"
33 #include "fmtcl/TransLut.h"
34 #include "fmtcl/TransOpInterface.h"
35 #include "fstb/fnc.h"
36
37 #if (fstb_ARCHI == fstb_ARCHI_X86)
38 #include "fstb/ToolsSse2.h"
39 #endif
40
41 #include <algorithm>
42
43 #include <cassert>
44 #include <cmath>
45 #include <cstdlib>
46
47
48
49 namespace fmtcl
50 {
51
52
53
54 #if (fstb_ARCHI == fstb_ARCHI_X86)
55
56
57
58 template <class M>
59 class TransLut_FindIndexSse2
60 {
61 public:
62 static const int LINLUT_RES_L2 = TransLut::LINLUT_RES_L2;
63 static const int LINLUT_MIN_F = TransLut::LINLUT_MIN_F;
64 static const int LINLUT_MAX_F = TransLut::LINLUT_MAX_F;
65 static const int LINLUT_SIZE_F = TransLut::LINLUT_SIZE_F;
66
67 static const int LOGLUT_MIN_L2 = TransLut::LOGLUT_MIN_L2;
68 static const int LOGLUT_MAX_L2 = TransLut::LOGLUT_MAX_L2;
69 static const int LOGLUT_RES_L2 = TransLut::LOGLUT_RES_L2;
70 static const int LOGLUT_HSIZE = TransLut::LOGLUT_HSIZE;
71 static const int LOGLUT_SIZE = TransLut::LOGLUT_SIZE;
72
73 static inline void
74 find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept;
75 };
76
77
78
79 template <>
find_index(const TransLut::FloatIntMix val_arr[4],__m128i & index,__m128 & frac)80 void TransLut_FindIndexSse2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept
81 {
82 assert (val_arr != nullptr);
83
84 constexpr int offset = -LINLUT_MIN_F * (1 << LINLUT_RES_L2);
85 const __m128 scale = _mm_set1_ps (1 << LINLUT_RES_L2);
86 const __m128i offset_ps = _mm_set1_epi32 (offset);
87 const __m128 val_min = _mm_set1_ps (0 - offset);
88 const __m128 val_max = _mm_set1_ps (LINLUT_SIZE_F - 2 - offset);
89
90 const __m128 v =
91 _mm_load_ps (reinterpret_cast <const float *> (val_arr));
92 __m128 val_scl = _mm_mul_ps (v, scale);
93 val_scl = _mm_min_ps (val_scl, val_max);
94 val_scl = _mm_max_ps (val_scl, val_min);
95 const __m128i index_raw = _mm_cvtps_epi32 (val_scl);
96 index = _mm_add_epi32 (index_raw, offset_ps);
97 frac = _mm_sub_ps (val_scl, _mm_cvtepi32_ps (index_raw));
98 }
99
100
101
102 template <>
find_index(const TransLut::FloatIntMix val_arr[4],__m128i & index,__m128 & frac)103 void TransLut_FindIndexSse2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept
104 {
105 assert (val_arr != nullptr);
106
107 // Constants
108 constexpr int mant_size = 23;
109 constexpr int exp_bias = 127;
110 constexpr uint32_t base = (exp_bias + LOGLUT_MIN_L2) << mant_size;
111 constexpr float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
112 // constexpr float val_max = float (int64_t (1) << LOGLUT_MAX_L2);
113 constexpr int frac_size = mant_size - LOGLUT_RES_L2;
114 constexpr uint32_t frac_mask = (1 << frac_size) - 1;
115
116 const __m128 zero_f = _mm_setzero_ps ();
117 const __m128 one_f = _mm_set1_ps (1);
118 const __m128 frac_mul = _mm_set1_ps (1.0f / (1 << frac_size));
119 const __m128 mul_eps = _mm_set1_ps (1.0f / val_min);
120 const __m128 mask_abs_f = _mm_load_ps (
121 reinterpret_cast <const float *> (fstb::ToolsSse2::_mask_abs)
122 );
123
124 const __m128i zero_i = _mm_setzero_si128 ();
125 const __m128i mask_abs_epi32 = _mm_set1_epi32 (0x7FFFFFFF);
126 const __m128i one_epi32 = _mm_set1_epi32 (1);
127 const __m128i base_epi32 = _mm_set1_epi32 (int (base));
128 const __m128i frac_mask_epi32 = _mm_set1_epi32 (frac_mask);
129 const __m128i val_min_epi32 =
130 _mm_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
131 const __m128i val_max_epi32 =
132 _mm_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
133 const __m128i index_max_epi32 =
134 _mm_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
135 const __m128i hsize_epi32 = _mm_set1_epi32 (LOGLUT_HSIZE);
136 const __m128i mirror_epi32 = _mm_set1_epi32 (LOGLUT_HSIZE - 1);
137
138 // It really starts here
139 const __m128 val_f = _mm_load_ps (reinterpret_cast <const float *> (val_arr));
140 const __m128 val_a = _mm_and_ps (val_f, mask_abs_f);
141 const __m128i val_i = _mm_load_si128 (reinterpret_cast <const __m128i *> (val_arr));
142 const __m128i val_u = _mm_and_si128 (val_i, mask_abs_epi32);
143
144 // Standard path
145 __m128i index_std = _mm_sub_epi32 (val_u, base_epi32);
146 index_std = _mm_srli_epi32 (index_std, frac_size);
147 index_std = _mm_add_epi32 (index_std, one_epi32);
148 __m128i frac_stdi = _mm_and_si128 (val_u, frac_mask_epi32);
149 __m128 frac_std = _mm_cvtepi32_ps (frac_stdi);
150 frac_std = _mm_mul_ps (frac_std, frac_mul);
151
152 // Epsilon path
153 __m128 frac_eps = _mm_max_ps (val_a, zero_f);
154 frac_eps = _mm_mul_ps (frac_eps, mul_eps);
155
156 // Range cases
157 const __m128i eps_flag_i = _mm_cmpgt_epi32 (val_min_epi32, val_u);
158 const __m128i std_flag_i = _mm_cmpgt_epi32 (val_max_epi32, val_u);
159 const __m128 eps_flag_f = _mm_castsi128_ps (eps_flag_i);
160 const __m128 std_flag_f = _mm_castsi128_ps (std_flag_i);
161 __m128i index_tmp =
162 fstb::ToolsSse2::select (std_flag_i, index_std, index_max_epi32);
163 __m128 frac_tmp =
164 fstb::ToolsSse2::select (std_flag_f, frac_std, one_f);
165 index_tmp = fstb::ToolsSse2::select (eps_flag_i, zero_i, index_tmp);
166 frac_tmp = fstb::ToolsSse2::select (eps_flag_f, frac_eps, frac_tmp);
167
168 // Sign cases
169 const __m128i neg_flag_i = _mm_srai_epi32 (val_i, 31);
170 const __m128 neg_flag_f = _mm_castsi128_ps (neg_flag_i);
171 const __m128i index_neg = _mm_sub_epi32 (mirror_epi32, index_tmp);
172 const __m128i index_pos = _mm_add_epi32 (hsize_epi32, index_tmp);
173 const __m128 frac_neg = _mm_sub_ps (one_f, frac_tmp);
174 index = fstb::ToolsSse2::select (neg_flag_i, index_neg, index_pos);
175 frac = fstb::ToolsSse2::select (neg_flag_f, frac_neg, frac_tmp);
176 }
177
178
179
180 template <class T>
TransLut_store_sse2(T * dst_ptr,__m128 val)181 static fstb_FORCEINLINE void TransLut_store_sse2 (T *dst_ptr, __m128 val) noexcept
182 {
183 _mm_store_si128 (
184 reinterpret_cast <__m128i *> (dst_ptr),
185 _mm_cvtps_epi32 (val)
186 );
187 }
188
TransLut_store_sse2(float * dst_ptr,__m128 val)189 static fstb_FORCEINLINE void TransLut_store_sse2 (float *dst_ptr, __m128 val) noexcept
190 {
191 _mm_store_ps (dst_ptr, val);
192 }
193
194
195
196 #endif // fstb_ARCHI_X86
197
198
199
200 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
201
202
203
204 constexpr int TransLut::LINLUT_RES_L2;
205 constexpr int TransLut::LINLUT_MIN_F;
206 constexpr int TransLut::LINLUT_MAX_F;
207 constexpr int TransLut::LINLUT_SIZE_F;
208 constexpr int TransLut::LOGLUT_MIN_L2;
209 constexpr int TransLut::LOGLUT_MAX_L2;
210 constexpr int TransLut::LOGLUT_RES_L2;
211 constexpr int TransLut::LOGLUT_HSIZE;
212 constexpr int TransLut::LOGLUT_SIZE;
213
214
215
TransLut(const TransOpInterface & curve,bool log_flag,SplFmt src_fmt,int src_bits,bool src_full_flag,SplFmt dst_fmt,int dst_bits,bool dst_full_flag,bool sse2_flag,bool avx2_flag)216 TransLut::TransLut (const TransOpInterface &curve, bool log_flag, SplFmt src_fmt, int src_bits, bool src_full_flag, SplFmt dst_fmt, int dst_bits, bool dst_full_flag, bool sse2_flag, bool avx2_flag)
217 : _loglut_flag (log_flag)
218 , _src_fmt (src_fmt)
219 , _src_bits (src_bits)
220 , _src_full_flag (src_full_flag)
221 , _dst_fmt (dst_fmt)
222 , _dst_bits (dst_bits)
223 , _dst_full_flag (dst_full_flag)
224 , _sse2_flag (sse2_flag)
225 , _avx2_flag (avx2_flag)
226 {
227 assert (src_fmt >= 0);
228 assert (src_fmt < SplFmt_NBR_ELT);
229 assert (src_bits >= 8);
230 assert (dst_fmt >= 0);
231 assert (dst_fmt < SplFmt_NBR_ELT);
232 assert (dst_bits >= 8);
233
234 generate_lut (curve);
235 init_proc_fnc ();
236 }
237
238
239
process_plane(const Plane<> & dst,const PlaneRO<> & src,int w,int h) const240 void TransLut::process_plane (const Plane <> &dst, const PlaneRO <> &src, int w, int h) const noexcept
241 {
242 assert (dst.is_valid (h));
243 assert (src.is_valid (h));
244 assert (w > 0);
245 assert (h > 0);
246
247 assert (_process_plane_ptr != nullptr);
248 (this->*_process_plane_ptr) (dst, src, w, h);
249 }
250
251
252
MapperLin(int lut_size,double range_beg,double range_lst)253 TransLut::MapperLin::MapperLin (int lut_size, double range_beg, double range_lst) noexcept
254 : _lut_size (lut_size)
255 , _range_beg (range_beg)
256 , _step ((range_lst - range_beg) / (lut_size - 1))
257 {
258 assert (lut_size >= 2);
259 assert (range_beg < range_lst);
260 }
261
262
263
find_index(const FloatIntMix & val,int & index,float & frac)264 void TransLut::MapperLin::find_index (const FloatIntMix &val, int &index, float &frac) noexcept
265 {
266 const float val_scl = val._f * (1 << LINLUT_RES_L2);
267 const int index_raw = fstb::floor_int (val_scl);
268 constexpr int offset = -LINLUT_MIN_F * (1 << LINLUT_RES_L2);
269 index = fstb::limit (index_raw + offset, 0, LINLUT_SIZE_F - 2);
270 frac = val_scl - float (index_raw);
271 }
272
273
274
find_val(int index) const275 double TransLut::MapperLin::find_val (int index) const noexcept
276 {
277 return _range_beg + index * _step;
278 }
279
280
281
find_index(const FloatIntMix & val,int & index,float & frac)282 void TransLut::MapperLog::find_index (const FloatIntMix &val, int &index, float &frac) noexcept
283 {
284 static_assert (LOGLUT_MIN_L2 <= 0, "LOGLUT_MIN_L2 must be negative");
285 static_assert (LOGLUT_MAX_L2 >= 0, "LOGLUT_MAX_L2 must be positive");
286
287 constexpr int mant_size = 23;
288 constexpr int exp_bias = 127;
289 constexpr uint32_t base = (exp_bias + LOGLUT_MIN_L2) << mant_size;
290 constexpr float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
291 constexpr float val_max = float (int64_t (1) << LOGLUT_MAX_L2);
292 constexpr int frac_size = mant_size - LOGLUT_RES_L2;
293 constexpr uint32_t frac_mask = (1 << frac_size) - 1;
294
295 const uint32_t val_u = val._i & 0x7FFFFFFF;
296 const float val_a = fabsf (val._f);
297
298 // index is set relatively to the x=0 index...
299 if (val_a < val_min)
300 {
301 index = 0;
302 frac = std::max (val_a, 0.0f) * (1.0f / val_min);
303 }
304 else if (val_a >= val_max)
305 {
306 index = ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
307 frac = 1;
308 }
309 else
310 {
311 index = ((val_u - base) >> frac_size) + 1;
312 frac = float (val_u & frac_mask) * (1.0f / (1 << frac_size));
313 }
314
315 // ...and shifted or mirrored depending on the sign
316 if (val._f >= 0)
317 {
318 index += LOGLUT_HSIZE;
319 }
320 else
321 {
322 // Because frac cannot be negative, step one index behind.
323 index = LOGLUT_HSIZE - 1 - index;
324 frac = 1 - frac;
325 }
326
327 assert (index >= 0);
328 assert (index < LOGLUT_SIZE - 1);
329 assert (frac >= 0);
330 assert (frac <= 1);
331 }
332
333
334
find_val(int index) const335 double TransLut::MapperLog::find_val (int index) const noexcept
336 {
337 assert (index >= 0);
338 assert (index < LOGLUT_SIZE);
339
340 static constexpr float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
341 static constexpr int seg_size = 1 << LOGLUT_RES_L2;
342
343 // float is OK because the values are exactly represented in float.
344 float val = 0;
345 int ind_2 = index - LOGLUT_HSIZE;
346 if (ind_2 != 0)
347 {
348 const int ind_3 = std::abs (ind_2) - 1;
349 const int log2_part = ind_3 >> LOGLUT_RES_L2;
350 const int seg_part = ind_3 & (seg_size - 1);
351 const float lerp = float (seg_part) * (1.0f / seg_size);
352 const float v0 = float (int64_t (1) << log2_part) * val_min;
353 val = v0 * (1 + lerp);
354 if (ind_2 < 0)
355 {
356 val = -val;
357 }
358 }
359
360 return val;
361 }
362
363
364
365 // For float input. Only checks the curvature, not the extended range
is_loglut_req(const TransOpInterface & curve)366 bool TransLut::is_loglut_req (const TransOpInterface &curve)
367 {
368 // Delta to compute the slope
369 constexpr double delta = 1.0 / 65536;
370
371 // Slope at 1, for reference
372 // Curve may be clipping early because of contrast increase, so we
373 // try smaller values
374 double x1 = 1;
375 double s1 = 0;
376 do
377 {
378 const double v1 = curve (x1);
379 const double v1d = curve (x1 - delta);
380 s1 = (v1 - v1d) / delta;
381 x1 *= 0.5;
382 }
383 while (s1 <= 0 && x1 >= 0.01);
384 // At this point s1 may still be 0, we will ignore the result.
385
386 // Slope at 0
387 const double v0 = curve (0);
388 const double v0d = curve (0 + delta);
389 const double s0 = (v0d - v0) / delta;
390 assert (s0 > 0);
391
392 // Arbitrary factor, seems to work decently
393 if (s1 > 0 && s0 >= 50 * s1)
394 {
395 return true;
396 }
397
398 // Slope close to 0
399 constexpr double xs = 1.0 / 4096;
400 const double vsn = curve (xs - delta * 0.5);
401 const double vsp = curve (xs + delta * 0.5);
402 const double ss = (vsp - vsn) / delta;
403 assert (ss > 0);
404
405 if (s0 >= 3 * ss)
406 {
407 return true;
408 }
409
410 return false;
411 }
412
413
414
415 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
416
417
418
419 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
420
421
422
generate_lut(const TransOpInterface & curve)423 void TransLut::generate_lut (const TransOpInterface &curve)
424 {
425 if (_src_fmt == SplFmt_FLOAT)
426 {
427 // When the source is float, the LUT output is always float
428 // so we can interpolate it easily and obtain the exact values.
429 // If the target data type is int, we quantize the interpolated
430 // values as a second step.
431 _lut.set_type <float> ();
432
433 if (_loglut_flag)
434 {
435 _lut.resize (LOGLUT_SIZE);
436 MapperLog mapper;
437 generate_lut_flt <float> (curve, mapper);
438 }
439 else
440 {
441 _lut.resize (LINLUT_SIZE_F);
442 MapperLin mapper (LINLUT_SIZE_F, LINLUT_MIN_F, LINLUT_MAX_F);
443 generate_lut_flt <float> (curve, mapper);
444 }
445 }
446
447 else
448 {
449 _loglut_flag = false;
450
451 int range = 1 << _src_bits;
452 if (_src_fmt == SplFmt_INT8)
453 {
454 _lut.resize (1 << 8);
455 }
456 else
457 {
458 _lut.resize (1 << 16);
459 }
460 const int sb16 = (_src_full_flag) ? 0 : Cst::_rtv_lum_blk << 8;
461 const int sw16 = (_src_full_flag) ? 0xFFFF : Cst::_rtv_lum_wht << 8;
462 int sbn = sb16 >> (16 - _src_bits);
463 int swn = sw16 >> (16 - _src_bits);
464 const int sdif = swn - sbn;
465 const double r_beg = double (0 - sbn) / sdif;
466 const double r_lst = double (range - 1 - sbn) / sdif;
467 if (_dst_fmt == SplFmt_FLOAT)
468 {
469 _lut.set_type <float> ();
470 MapperLin mapper (range, r_beg, r_lst);
471 generate_lut_flt <float> (curve, mapper);
472 }
473 else
474 {
475 const int db16 = (_dst_full_flag) ? 0 : Cst::_rtv_lum_blk << 8;
476 const int dw16 = (_dst_full_flag) ? 0xFFFF : Cst::_rtv_lum_wht << 8;
477 int dbn = db16 >> (16 - _dst_bits);
478 int dwn = dw16 >> (16 - _dst_bits);
479 const double mul = dwn - dbn;
480 const double add = dbn;
481 if (_dst_bits > 8)
482 {
483 _lut.set_type <uint16_t> ();
484 generate_lut_int <uint16_t> (
485 curve, range, r_beg, r_lst, mul, add
486 );
487 }
488 else
489 {
490 _lut.set_type <uint8_t> ();
491 generate_lut_int <uint8_t> (
492 curve, range, r_beg, r_lst, mul, add
493 );
494 }
495 }
496 }
497 }
498
499
500
501 // T = LUT data type (int or float)
502 template <class T>
generate_lut_int(const TransOpInterface & curve,int lut_size,double range_beg,double range_lst,double mul,double add)503 void TransLut::generate_lut_int (const TransOpInterface &curve, int lut_size, double range_beg, double range_lst, double mul, double add)
504 {
505 assert (_dst_fmt != SplFmt_FLOAT);
506 assert (lut_size > 1);
507 assert (range_beg < range_lst);
508
509 const double scale = (range_lst - range_beg) / (lut_size - 1);
510 const int max_val = (1 << _dst_bits) - 1;
511 for (int pos = 0; pos < lut_size; ++pos)
512 {
513 const double x = range_beg + pos * scale;
514 const double y = curve (x) * mul + add;
515 _lut.use <T> (pos) = T (fstb::limit (fstb::round_int (y), 0, max_val));
516 }
517 }
518
519
520
521 // T = float
522 template <class T, class M>
generate_lut_flt(const TransOpInterface & curve,const M & mapper)523 void TransLut::generate_lut_flt (const TransOpInterface &curve, const M &mapper)
524 {
525 const int lut_size = mapper.get_lut_size ();
526 for (int pos = 0; pos < lut_size; ++pos)
527 {
528 const double x = mapper.find_val (pos);
529 const double y = curve (x);
530 _lut.use <T> (pos) = T (y);
531 }
532 }
533
534
535
init_proc_fnc()536 void TransLut::init_proc_fnc ()
537 {
538 assert (! _loglut_flag || _src_fmt == SplFmt_FLOAT);
539
540 const int s =
541 (_loglut_flag ) ? 0
542 : (_src_fmt == SplFmt_FLOAT) ? 1
543 : (_src_bits > 8 ) ? 2
544 : 3;
545 const int d =
546 (_dst_fmt == SplFmt_FLOAT) ? 0
547 : (_dst_bits > 8 ) ? 1
548 : 2;
549
550 const int selector = d * 4 + s;
551
552 switch (selector)
553 {
554 case 0*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < float , MapperLog>; break;
555 case 0*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < float , MapperLin>; break;
556 case 0*4+2: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint16_t, float >; break;
557 case 0*4+3: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint8_t , float >; break;
558 case 1*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < uint16_t, MapperLog>; break;
559 case 1*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < uint16_t, MapperLin>; break;
560 case 1*4+2: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint16_t, uint16_t >; break;
561 case 1*4+3: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint8_t , uint16_t >; break;
562 case 2*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < uint8_t , MapperLog>; break;
563 case 2*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_cpp < uint8_t , MapperLin>; break;
564 case 2*4+2: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint16_t, uint8_t >; break;
565 case 2*4+3: _process_plane_ptr = &ThisType::process_plane_int_any_cpp <uint8_t , uint8_t >; break;
566
567 default:
568 assert (false);
569 break;
570 }
571 #if (fstb_ARCHI == fstb_ARCHI_X86)
572 init_proc_fnc_sse2 (selector);
573 init_proc_fnc_avx2 (selector);
574 #endif
575 }
576
577
578
579 #if (fstb_ARCHI == fstb_ARCHI_X86)
580
init_proc_fnc_sse2(int selector)581 void TransLut::init_proc_fnc_sse2 (int selector)
582 {
583 if (_sse2_flag && _src_fmt == SplFmt_FLOAT)
584 {
585 switch (selector)
586 {
587 case 0*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <float , MapperLog>; break;
588 case 0*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <float , MapperLin>; break;
589 case 1*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint16_t, MapperLog>; break;
590 case 1*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint16_t, MapperLin>; break;
591 case 2*4+0: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint8_t , MapperLog>; break;
592 case 2*4+1: _process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint8_t , MapperLin>; break;
593
594 default:
595 // Nothing
596 break;
597 }
598 }
599 }
600
601 #endif // fstb_ARCHI_X86
602
603
604
605 template <class TS, class TD>
process_plane_int_any_cpp(Plane<> dst,PlaneRO<> src,int w,int h) const606 void TransLut::process_plane_int_any_cpp (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
607 {
608 assert (dst.is_valid (h));
609 assert (src.is_valid (h));
610 assert (w > 0);
611 assert (h > 0);
612
613 for (int y = 0; y < h; ++y)
614 {
615 const PlaneRO <TS> s { src };
616 const Plane <TD> d { dst };
617
618 for (int x = 0; x < w; ++x)
619 {
620 const int index = s._ptr [x];
621 d._ptr [x] = _lut.use <TD> (index);
622 }
623
624 src.step_line ();
625 dst.step_line ();
626 }
627 }
628
629
630
631 template <class TD, class M>
process_plane_flt_any_cpp(Plane<> dst,PlaneRO<> src,int w,int h) const632 void TransLut::process_plane_flt_any_cpp (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
633 {
634 assert (dst.is_valid (h));
635 assert (src.is_valid (h));
636 assert (w > 0);
637 assert (h > 0);
638
639 for (int y = 0; y < h; ++y)
640 {
641 const PlaneRO <FloatIntMix> s { src };
642 const Plane <TD> d { dst };
643
644 for (int x = 0; x < w; ++x)
645 {
646 int index;
647 float lerp;
648 M::find_index (s._ptr [x], index, lerp);
649 const float p_0 = _lut.use <float> (index );
650 const float p_1 = _lut.use <float> (index + 1);
651 const float dif = p_1 - p_0;
652 const float val = p_0 + lerp * dif;
653 d._ptr [x] = Convert <TD>::cast (val);
654 }
655
656 src.step_line ();
657 dst.step_line ();
658 }
659 }
660
661
662
663 #if (fstb_ARCHI == fstb_ARCHI_X86)
664
665
666
667 template <class TD, class M>
process_plane_flt_any_sse2(Plane<> dst,PlaneRO<> src,int w,int h) const668 void TransLut::process_plane_flt_any_sse2 (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
669 {
670 assert (dst.is_valid (h));
671 assert (src.is_valid (h));
672 assert (w > 0);
673 assert (h > 0);
674
675 for (int y = 0; y < h; ++y)
676 {
677 const PlaneRO <FloatIntMix> s { src };
678 const Plane <TD> d { dst };
679
680 for (int x = 0; x < w; x += 4)
681 {
682 union
683 {
684 __m128i _vect;
685 uint32_t _scal [4];
686 } index;
687 __m128 lerp;
688 TransLut_FindIndexSse2 <M>::find_index (s._ptr + x, index._vect, lerp);
689 __m128 val = _mm_set_ps (
690 _lut.use <float> (index._scal [3] ),
691 _lut.use <float> (index._scal [2] ),
692 _lut.use <float> (index._scal [1] ),
693 _lut.use <float> (index._scal [0] )
694 );
695 __m128 va2 = _mm_set_ps (
696 _lut.use <float> (index._scal [3] + 1),
697 _lut.use <float> (index._scal [2] + 1),
698 _lut.use <float> (index._scal [1] + 1),
699 _lut.use <float> (index._scal [0] + 1)
700 );
701 const __m128 dif = _mm_sub_ps (va2, val);
702 val = _mm_add_ps (val, _mm_mul_ps (dif, lerp));
703 TransLut_store_sse2 (&d._ptr [x], val);
704 }
705
706 src.step_line ();
707 dst.step_line ();
708 }
709 }
710
711
712
713 #endif
714
715
716
717 template <class T>
cast(float val)718 T TransLut::Convert <T>::cast (float val) noexcept
719 {
720 return T (fstb::conv_int_fast (val));
721 }
722
723 template <>
cast(float val)724 float TransLut::Convert <float>::cast (float val) noexcept
725 {
726 return val;
727 }
728
729
730
731 } // namespace fmtcl
732
733
734
735 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
736