1 /*****************************************************************************
2 
3         TransLut.cpp
4         Author: Laurent de Soras, 2015
5 
6 To do:
7 	- Remove code for destination bitdepth < 16
8 
9 --- Legal stuff ---
10 
11 This program is free software. It comes without any warranty, to
12 the extent permitted by applicable law. You can redistribute it
13 and/or modify it under the terms of the Do What The Fuck You Want
14 To Public License, Version 2, as published by Sam Hocevar. See
15 http://sam.zoy.org/wtfpl/COPYING for more details.
16 
17 *Tab=3***********************************************************************/
18 
19 
20 
21 #if defined (_MSC_VER)
22 	#pragma warning (1 : 4130 4223 4705 4706)
23 	#pragma warning (4 : 4355 4786 4800)
24 #endif
25 
26 
27 
28 /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
29 
30 #include "fstb/def.h"
31 
32 #include "fmtcl/Cst.h"
33 #include "fmtcl/TransLut.h"
34 #include "fmtcl/TransOpInterface.h"
35 #include "fstb/fnc.h"
36 
37 #if (fstb_ARCHI == fstb_ARCHI_X86)
38 	#include "fstb/ToolsSse2.h"
39 #endif
40 
41 #include <algorithm>
42 
43 #include <cassert>
44 #include <cmath>
45 #include <cstdlib>
46 
47 
48 
49 namespace fmtcl
50 {
51 
52 
53 
54 #if (fstb_ARCHI == fstb_ARCHI_X86)
55 
56 
57 
58 template <class M>
59 class TransLut_FindIndexSse2
60 {
61 public:
62 	static const int  LINLUT_RES_L2  = TransLut::LINLUT_RES_L2;
63 	static const int  LINLUT_MIN_F   = TransLut::LINLUT_MIN_F;
64 	static const int  LINLUT_MAX_F   = TransLut::LINLUT_MAX_F;
65 	static const int  LINLUT_SIZE_F  = TransLut::LINLUT_SIZE_F;
66 
67 	static const int  LOGLUT_MIN_L2  = TransLut::LOGLUT_MIN_L2;
68 	static const int  LOGLUT_MAX_L2  = TransLut::LOGLUT_MAX_L2;
69 	static const int  LOGLUT_RES_L2  = TransLut::LOGLUT_RES_L2;
70 	static const int  LOGLUT_HSIZE   = TransLut::LOGLUT_HSIZE;
71 	static const int  LOGLUT_SIZE    = TransLut::LOGLUT_SIZE;
72 
73 	static inline void
74 		            find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept;
75 };
76 
77 
78 
79 template <>
find_index(const TransLut::FloatIntMix val_arr[4],__m128i & index,__m128 & frac)80 void	TransLut_FindIndexSse2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept
81 {
82 	assert (val_arr != nullptr);
83 
84 	constexpr int  offset    = -LINLUT_MIN_F * (1 << LINLUT_RES_L2);
85 	const __m128   scale     = _mm_set1_ps (1 << LINLUT_RES_L2);
86 	const __m128i  offset_ps = _mm_set1_epi32 (offset);
87 	const __m128   val_min   = _mm_set1_ps (0                 - offset);
88 	const __m128   val_max   = _mm_set1_ps (LINLUT_SIZE_F - 2 - offset);
89 
90 	const __m128   v         =
91 		_mm_load_ps (reinterpret_cast <const float *> (val_arr));
92 	__m128         val_scl   = _mm_mul_ps (v, scale);
93 	val_scl = _mm_min_ps (val_scl, val_max);
94 	val_scl = _mm_max_ps (val_scl, val_min);
95 	const __m128i  index_raw = _mm_cvtps_epi32 (val_scl);
96 	index     = _mm_add_epi32 (index_raw, offset_ps);
97 	frac      = _mm_sub_ps (val_scl, _mm_cvtepi32_ps (index_raw));
98 }
99 
100 
101 
102 template <>
find_index(const TransLut::FloatIntMix val_arr[4],__m128i & index,__m128 & frac)103 void	TransLut_FindIndexSse2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [4], __m128i &index, __m128 &frac) noexcept
104 {
105 	assert (val_arr != nullptr);
106 
107 	// Constants
108 	constexpr int        mant_size = 23;
109 	constexpr int        exp_bias  = 127;
110 	constexpr uint32_t   base      = (exp_bias + LOGLUT_MIN_L2) << mant_size;
111 	constexpr float      val_min   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
112 //	constexpr float      val_max   = float (int64_t (1) << LOGLUT_MAX_L2);
113 	constexpr int        frac_size = mant_size - LOGLUT_RES_L2;
114 	constexpr uint32_t   frac_mask = (1 << frac_size) - 1;
115 
116 	const __m128   zero_f     = _mm_setzero_ps ();
117 	const __m128   one_f      = _mm_set1_ps (1);
118 	const __m128   frac_mul   = _mm_set1_ps (1.0f / (1 << frac_size));
119 	const __m128   mul_eps    = _mm_set1_ps (1.0f / val_min);
120 	const __m128   mask_abs_f = _mm_load_ps (
121 		reinterpret_cast <const float *> (fstb::ToolsSse2::_mask_abs)
122 	);
123 
124 	const __m128i  zero_i          = _mm_setzero_si128 ();
125 	const __m128i  mask_abs_epi32  = _mm_set1_epi32 (0x7FFFFFFF);
126 	const __m128i  one_epi32       = _mm_set1_epi32 (1);
127 	const __m128i  base_epi32      = _mm_set1_epi32 (int (base));
128 	const __m128i  frac_mask_epi32 = _mm_set1_epi32 (frac_mask);
129 	const __m128i  val_min_epi32   =
130 		_mm_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
131 	const __m128i  val_max_epi32   =
132 		_mm_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
133 	const __m128i  index_max_epi32 =
134 		_mm_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
135 	const __m128i  hsize_epi32     = _mm_set1_epi32 (LOGLUT_HSIZE);
136 	const __m128i  mirror_epi32    = _mm_set1_epi32 (LOGLUT_HSIZE - 1);
137 
138 	// It really starts here
139 	const __m128   val_f = _mm_load_ps (reinterpret_cast <const float *> (val_arr));
140 	const __m128   val_a = _mm_and_ps (val_f, mask_abs_f);
141 	const __m128i  val_i = _mm_load_si128 (reinterpret_cast <const __m128i *> (val_arr));
142 	const __m128i  val_u = _mm_and_si128 (val_i, mask_abs_epi32);
143 
144 	// Standard path
145 	__m128i        index_std = _mm_sub_epi32 (val_u, base_epi32);
146 	index_std = _mm_srli_epi32 (index_std, frac_size);
147 	index_std = _mm_add_epi32 (index_std, one_epi32);
148 	__m128i        frac_stdi = _mm_and_si128 (val_u, frac_mask_epi32);
149 	__m128         frac_std  = _mm_cvtepi32_ps (frac_stdi);
150 	frac_std  = _mm_mul_ps (frac_std, frac_mul);
151 
152 	// Epsilon path
153 	__m128         frac_eps  = _mm_max_ps (val_a, zero_f);
154 	frac_eps = _mm_mul_ps (frac_eps, mul_eps);
155 
156 	// Range cases
157 	const __m128i  eps_flag_i = _mm_cmpgt_epi32 (val_min_epi32, val_u);
158 	const __m128i  std_flag_i = _mm_cmpgt_epi32 (val_max_epi32, val_u);
159 	const __m128   eps_flag_f = _mm_castsi128_ps (eps_flag_i);
160 	const __m128   std_flag_f = _mm_castsi128_ps (std_flag_i);
161 	__m128i        index_tmp  =
162 		fstb::ToolsSse2::select (std_flag_i, index_std, index_max_epi32);
163 	__m128         frac_tmp   =
164 		fstb::ToolsSse2::select (std_flag_f, frac_std, one_f);
165 	index_tmp = fstb::ToolsSse2::select (eps_flag_i, zero_i, index_tmp);
166 	frac_tmp  = fstb::ToolsSse2::select (eps_flag_f, frac_eps, frac_tmp);
167 
168 	// Sign cases
169 	const __m128i  neg_flag_i = _mm_srai_epi32 (val_i, 31);
170 	const __m128   neg_flag_f = _mm_castsi128_ps (neg_flag_i);
171 	const __m128i  index_neg  = _mm_sub_epi32 (mirror_epi32, index_tmp);
172 	const __m128i  index_pos  = _mm_add_epi32 (hsize_epi32, index_tmp);
173 	const __m128   frac_neg   = _mm_sub_ps (one_f, frac_tmp);
174 	index = fstb::ToolsSse2::select (neg_flag_i, index_neg, index_pos);
175 	frac  = fstb::ToolsSse2::select (neg_flag_f, frac_neg, frac_tmp);
176 }
177 
178 
179 
180 template <class T>
TransLut_store_sse2(T * dst_ptr,__m128 val)181 static fstb_FORCEINLINE void	TransLut_store_sse2 (T *dst_ptr, __m128 val) noexcept
182 {
183 	_mm_store_si128 (
184 		reinterpret_cast <__m128i *> (dst_ptr),
185 		_mm_cvtps_epi32 (val)
186 	);
187 }
188 
TransLut_store_sse2(float * dst_ptr,__m128 val)189 static fstb_FORCEINLINE void	TransLut_store_sse2 (float *dst_ptr, __m128 val) noexcept
190 {
191 	_mm_store_ps (dst_ptr, val);
192 }
193 
194 
195 
196 #endif   // fstb_ARCHI_X86
197 
198 
199 
200 /*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
201 
202 
203 
204 constexpr int	TransLut::LINLUT_RES_L2;
205 constexpr int	TransLut::LINLUT_MIN_F;
206 constexpr int	TransLut::LINLUT_MAX_F;
207 constexpr int	TransLut::LINLUT_SIZE_F;
208 constexpr int	TransLut::LOGLUT_MIN_L2;
209 constexpr int	TransLut::LOGLUT_MAX_L2;
210 constexpr int	TransLut::LOGLUT_RES_L2;
211 constexpr int	TransLut::LOGLUT_HSIZE;
212 constexpr int	TransLut::LOGLUT_SIZE;
213 
214 
215 
TransLut(const TransOpInterface & curve,bool log_flag,SplFmt src_fmt,int src_bits,bool src_full_flag,SplFmt dst_fmt,int dst_bits,bool dst_full_flag,bool sse2_flag,bool avx2_flag)216 TransLut::TransLut (const TransOpInterface &curve, bool log_flag, SplFmt src_fmt, int src_bits, bool src_full_flag, SplFmt dst_fmt, int dst_bits, bool dst_full_flag, bool sse2_flag, bool avx2_flag)
217 :	_loglut_flag (log_flag)
218 ,	_src_fmt (src_fmt)
219 ,	_src_bits (src_bits)
220 ,	_src_full_flag (src_full_flag)
221 ,	_dst_fmt (dst_fmt)
222 ,	_dst_bits (dst_bits)
223 ,	_dst_full_flag (dst_full_flag)
224 ,	_sse2_flag (sse2_flag)
225 ,	_avx2_flag (avx2_flag)
226 {
227 	assert (src_fmt >= 0);
228 	assert (src_fmt < SplFmt_NBR_ELT);
229 	assert (src_bits >= 8);
230 	assert (dst_fmt >= 0);
231 	assert (dst_fmt < SplFmt_NBR_ELT);
232 	assert (dst_bits >= 8);
233 
234 	generate_lut (curve);
235 	init_proc_fnc ();
236 }
237 
238 
239 
process_plane(const Plane<> & dst,const PlaneRO<> & src,int w,int h) const240 void	TransLut::process_plane (const Plane <> &dst, const PlaneRO <> &src, int w, int h) const noexcept
241 {
242 	assert (dst.is_valid (h));
243 	assert (src.is_valid (h));
244 	assert (w > 0);
245 	assert (h > 0);
246 
247 	assert (_process_plane_ptr != nullptr);
248 	(this->*_process_plane_ptr) (dst, src, w, h);
249 }
250 
251 
252 
MapperLin(int lut_size,double range_beg,double range_lst)253 TransLut::MapperLin::MapperLin (int lut_size, double range_beg, double range_lst) noexcept
254 :	_lut_size (lut_size)
255 ,	_range_beg (range_beg)
256 ,	_step ((range_lst - range_beg) / (lut_size - 1))
257 {
258 	assert (lut_size >= 2);
259 	assert (range_beg < range_lst);
260 }
261 
262 
263 
find_index(const FloatIntMix & val,int & index,float & frac)264 void	TransLut::MapperLin::find_index (const FloatIntMix &val, int &index, float &frac) noexcept
265 {
266 	const float    val_scl   = val._f * (1 << LINLUT_RES_L2);
267 	const int      index_raw = fstb::floor_int (val_scl);
268 	constexpr int  offset    = -LINLUT_MIN_F * (1 << LINLUT_RES_L2);
269 	index = fstb::limit (index_raw + offset, 0, LINLUT_SIZE_F - 2);
270 	frac  = val_scl - float (index_raw);
271 }
272 
273 
274 
find_val(int index) const275 double	TransLut::MapperLin::find_val (int index) const noexcept
276 {
277 	return _range_beg + index * _step;
278 }
279 
280 
281 
find_index(const FloatIntMix & val,int & index,float & frac)282 void	TransLut::MapperLog::find_index (const FloatIntMix &val, int &index, float &frac) noexcept
283 {
284 	static_assert (LOGLUT_MIN_L2 <= 0, "LOGLUT_MIN_L2 must be negative");
285 	static_assert (LOGLUT_MAX_L2 >= 0, "LOGLUT_MAX_L2 must be positive");
286 
287 	constexpr int        mant_size = 23;
288 	constexpr int        exp_bias  = 127;
289 	constexpr uint32_t   base      = (exp_bias + LOGLUT_MIN_L2) << mant_size;
290 	constexpr float      val_min   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
291 	constexpr float      val_max   = float (int64_t (1) << LOGLUT_MAX_L2);
292 	constexpr int        frac_size = mant_size - LOGLUT_RES_L2;
293 	constexpr uint32_t   frac_mask = (1 << frac_size) - 1;
294 
295 	const uint32_t val_u = val._i & 0x7FFFFFFF;
296 	const float    val_a = fabsf (val._f);
297 
298 	// index is set relatively to the x=0 index...
299 	if (val_a < val_min)
300 	{
301 		index = 0;
302 		frac  = std::max (val_a, 0.0f) * (1.0f / val_min);
303 	}
304 	else if (val_a >= val_max)
305 	{
306 		index = ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
307 		frac  = 1;
308 	}
309 	else
310 	{
311 		index = ((val_u - base) >> frac_size) + 1;
312 		frac  = float (val_u & frac_mask) * (1.0f / (1 << frac_size));
313 	}
314 
315 	// ...and shifted or mirrored depending on the sign
316 	if (val._f >= 0)
317 	{
318 		index += LOGLUT_HSIZE;
319 	}
320 	else
321 	{
322 		// Because frac cannot be negative, step one index behind.
323 		index = LOGLUT_HSIZE - 1 - index;
324 		frac  = 1 - frac;
325 	}
326 
327 	assert (index >= 0);
328 	assert (index < LOGLUT_SIZE - 1);
329 	assert (frac >= 0);
330 	assert (frac <= 1);
331 }
332 
333 
334 
find_val(int index) const335 double	TransLut::MapperLog::find_val (int index) const noexcept
336 {
337 	assert (index >= 0);
338 	assert (index < LOGLUT_SIZE);
339 
340 	static constexpr float   val_min  = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
341 	static constexpr int     seg_size = 1 << LOGLUT_RES_L2;
342 
343 	// float is OK because the values are exactly represented in float.
344 	float          val   = 0;
345 	int            ind_2 = index - LOGLUT_HSIZE;
346 	if (ind_2 != 0)
347 	{
348 		const int      ind_3     = std::abs (ind_2) - 1;
349 		const int      log2_part = ind_3 >> LOGLUT_RES_L2;
350 		const int      seg_part  = ind_3 & (seg_size - 1);
351 		const float    lerp      = float (seg_part) * (1.0f / seg_size);
352 		const float    v0        = float (int64_t (1) << log2_part) * val_min;
353 		val = v0 * (1 + lerp);
354 		if (ind_2 < 0)
355 		{
356 			val = -val;
357 		}
358 	}
359 
360 	return val;
361 }
362 
363 
364 
365 // For float input. Only checks the curvature, not the extended range
is_loglut_req(const TransOpInterface & curve)366 bool	TransLut::is_loglut_req (const TransOpInterface &curve)
367 {
368 	// Delta to compute the slope
369 	constexpr double  delta = 1.0 / 65536;
370 
371 	// Slope at 1, for reference
372 	// Curve may be clipping early because of contrast increase, so we
373 	// try smaller values
374 	double         x1 = 1;
375 	double         s1 = 0;
376 	do
377 	{
378 		const double   v1  = curve (x1);
379 		const double   v1d = curve (x1 - delta);
380 		s1 = (v1 - v1d) / delta;
381 		x1 *= 0.5;
382 	}
383 	while (s1 <= 0 && x1 >= 0.01);
384 	// At this point s1 may still be 0, we will ignore the result.
385 
386 	// Slope at 0
387 	const double   v0  = curve (0);
388 	const double   v0d = curve (0 + delta);
389 	const double   s0  = (v0d - v0) / delta;
390 	assert (s0 > 0);
391 
392 	// Arbitrary factor, seems to work decently
393 	if (s1 > 0 && s0 >= 50 * s1)
394 	{
395 		return true;
396 	}
397 
398 	// Slope close to 0
399 	constexpr double  xs = 1.0 / 4096;
400 	const double   vsn = curve (xs - delta * 0.5);
401 	const double   vsp = curve (xs + delta * 0.5);
402 	const double   ss  = (vsp - vsn) / delta;
403 	assert (ss > 0);
404 
405 	if (s0 >= 3 * ss)
406 	{
407 		return true;
408 	}
409 
410 	return false;
411 }
412 
413 
414 
415 /*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
416 
417 
418 
419 /*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
420 
421 
422 
generate_lut(const TransOpInterface & curve)423 void	TransLut::generate_lut (const TransOpInterface &curve)
424 {
425 	if (_src_fmt == SplFmt_FLOAT)
426 	{
427 		// When the source is float, the LUT output is always float
428 		// so we can interpolate it easily and obtain the exact values.
429 		// If the target data type is int, we quantize the interpolated
430 		// values as a second step.
431 		_lut.set_type <float> ();
432 
433 		if (_loglut_flag)
434 		{
435 			_lut.resize (LOGLUT_SIZE);
436 			MapperLog   mapper;
437 			generate_lut_flt <float> (curve, mapper);
438 		}
439 		else
440 		{
441 			_lut.resize (LINLUT_SIZE_F);
442 			MapperLin   mapper (LINLUT_SIZE_F, LINLUT_MIN_F, LINLUT_MAX_F);
443 			generate_lut_flt <float> (curve, mapper);
444 		}
445 	}
446 
447 	else
448 	{
449 		_loglut_flag = false;
450 
451 		int            range = 1 << _src_bits;
452 		if (_src_fmt == SplFmt_INT8)
453 		{
454 			_lut.resize (1 << 8);
455 		}
456 		else
457 		{
458 			_lut.resize (1 << 16);
459 		}
460 		const int      sb16  = (_src_full_flag) ? 0      : Cst::_rtv_lum_blk << 8;
461 		const int      sw16  = (_src_full_flag) ? 0xFFFF : Cst::_rtv_lum_wht << 8;
462 		int            sbn   = sb16 >> (16 - _src_bits);
463 		int            swn   = sw16 >> (16 - _src_bits);
464 		const int      sdif  = swn - sbn;
465 		const double   r_beg = double (0         - sbn) / sdif;
466 		const double   r_lst = double (range - 1 - sbn) / sdif;
467 		if (_dst_fmt == SplFmt_FLOAT)
468 		{
469 			_lut.set_type <float> ();
470 			MapperLin      mapper (range, r_beg, r_lst);
471 			generate_lut_flt <float> (curve, mapper);
472 		}
473 		else
474 		{
475 			const int      db16 = (_dst_full_flag) ? 0      : Cst::_rtv_lum_blk << 8;
476 			const int      dw16 = (_dst_full_flag) ? 0xFFFF : Cst::_rtv_lum_wht << 8;
477 			int            dbn  = db16 >> (16 - _dst_bits);
478 			int            dwn  = dw16 >> (16 - _dst_bits);
479 			const double   mul  = dwn - dbn;
480 			const double   add  = dbn;
481 			if (_dst_bits > 8)
482 			{
483 				_lut.set_type <uint16_t> ();
484 				generate_lut_int <uint16_t> (
485 					curve, range, r_beg, r_lst, mul, add
486 				);
487 			}
488 			else
489 			{
490 				_lut.set_type <uint8_t> ();
491 				generate_lut_int <uint8_t> (
492 					curve, range, r_beg, r_lst, mul, add
493 				);
494 			}
495 		}
496 	}
497 }
498 
499 
500 
501 // T = LUT data type (int or float)
502 template <class T>
generate_lut_int(const TransOpInterface & curve,int lut_size,double range_beg,double range_lst,double mul,double add)503 void	TransLut::generate_lut_int (const TransOpInterface &curve, int lut_size, double range_beg, double range_lst, double mul, double add)
504 {
505 	assert (_dst_fmt != SplFmt_FLOAT);
506 	assert (lut_size > 1);
507 	assert (range_beg < range_lst);
508 
509 	const double   scale   = (range_lst - range_beg) / (lut_size - 1);
510 	const int      max_val = (1 << _dst_bits) - 1;
511 	for (int pos = 0; pos < lut_size; ++pos)
512 	{
513 		const double   x = range_beg + pos * scale;
514 		const double   y = curve (x) * mul + add;
515 		_lut.use <T> (pos) = T (fstb::limit (fstb::round_int (y), 0, max_val));
516 	}
517 }
518 
519 
520 
521 // T = float
522 template <class T, class M>
generate_lut_flt(const TransOpInterface & curve,const M & mapper)523 void	TransLut::generate_lut_flt (const TransOpInterface &curve, const M &mapper)
524 {
525 	const int      lut_size = mapper.get_lut_size ();
526 	for (int pos = 0; pos < lut_size; ++pos)
527 	{
528 		const double   x = mapper.find_val (pos);
529 		const double   y = curve (x);
530 		_lut.use <T> (pos) = T (y);
531 	}
532 }
533 
534 
535 
init_proc_fnc()536 void	TransLut::init_proc_fnc ()
537 {
538 	assert (! _loglut_flag || _src_fmt == SplFmt_FLOAT);
539 
540 	const int      s =
541 		  (_loglut_flag            ) ? 0
542 		: (_src_fmt == SplFmt_FLOAT) ? 1
543 		: (_src_bits > 8           ) ? 2
544 		:                              3;
545 	const int      d =
546 		  (_dst_fmt == SplFmt_FLOAT) ? 0
547 		: (_dst_bits > 8           ) ? 1
548 		:                              2;
549 
550 	const int      selector = d * 4 + s;
551 
552 	switch (selector)
553 	{
554 	case 0*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          float   , MapperLog>; break;
555 	case 0*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          float   , MapperLin>; break;
556 	case 0*4+2:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint16_t, float              >; break;
557 	case 0*4+3:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint8_t , float              >; break;
558 	case 1*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          uint16_t, MapperLog>; break;
559 	case 1*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          uint16_t, MapperLin>; break;
560 	case 1*4+2:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint16_t, uint16_t           >; break;
561 	case 1*4+3:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint8_t , uint16_t           >; break;
562 	case 2*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          uint8_t , MapperLog>; break;
563 	case 2*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_cpp  <          uint8_t , MapperLin>; break;
564 	case 2*4+2:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint16_t, uint8_t            >; break;
565 	case 2*4+3:	_process_plane_ptr = &ThisType::process_plane_int_any_cpp  <uint8_t , uint8_t            >; break;
566 
567 	default:
568 		assert (false);
569 		break;
570 	}
571 #if (fstb_ARCHI == fstb_ARCHI_X86)
572 	init_proc_fnc_sse2 (selector);
573 	init_proc_fnc_avx2 (selector);
574 #endif
575 }
576 
577 
578 
579 #if (fstb_ARCHI == fstb_ARCHI_X86)
580 
init_proc_fnc_sse2(int selector)581 void	TransLut::init_proc_fnc_sse2 (int selector)
582 {
583 	if (_sse2_flag && _src_fmt == SplFmt_FLOAT)
584 	{
585 		switch (selector)
586 		{
587 		case 0*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <float   , MapperLog>; break;
588 		case 0*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <float   , MapperLin>; break;
589 		case 1*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint16_t, MapperLog>; break;
590 		case 1*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint16_t, MapperLin>; break;
591 		case 2*4+0:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint8_t , MapperLog>; break;
592 		case 2*4+1:	_process_plane_ptr = &ThisType::process_plane_flt_any_sse2 <uint8_t , MapperLin>; break;
593 
594 		default:
595 			// Nothing
596 			break;
597 		}
598 	}
599 }
600 
601 #endif   // fstb_ARCHI_X86
602 
603 
604 
605 template <class TS, class TD>
process_plane_int_any_cpp(Plane<> dst,PlaneRO<> src,int w,int h) const606 void	TransLut::process_plane_int_any_cpp (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
607 {
608 	assert (dst.is_valid (h));
609 	assert (src.is_valid (h));
610 	assert (w > 0);
611 	assert (h > 0);
612 
613 	for (int y = 0; y < h; ++y)
614 	{
615 		const PlaneRO <TS>   s { src };
616 		const Plane <TD>     d { dst };
617 
618 		for (int x = 0; x < w; ++x)
619 		{
620 			const int          index = s._ptr [x];
621 			d._ptr [x] = _lut.use <TD> (index);
622 		}
623 
624 		src.step_line ();
625 		dst.step_line ();
626 	}
627 }
628 
629 
630 
631 template <class TD, class M>
process_plane_flt_any_cpp(Plane<> dst,PlaneRO<> src,int w,int h) const632 void	TransLut::process_plane_flt_any_cpp (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
633 {
634 	assert (dst.is_valid (h));
635 	assert (src.is_valid (h));
636 	assert (w > 0);
637 	assert (h > 0);
638 
639 	for (int y = 0; y < h; ++y)
640 	{
641 		const PlaneRO <FloatIntMix>   s { src };
642 		const Plane <TD>              d { dst };
643 
644 		for (int x = 0; x < w; ++x)
645 		{
646 			int                index;
647 			float              lerp;
648 			M::find_index (s._ptr [x], index, lerp);
649 			const float        p_0  = _lut.use <float> (index    );
650 			const float        p_1  = _lut.use <float> (index + 1);
651 			const float        dif  = p_1 - p_0;
652 			const float        val  = p_0 + lerp * dif;
653 			d._ptr [x] = Convert <TD>::cast (val);
654 		}
655 
656 		src.step_line ();
657 		dst.step_line ();
658 	}
659 }
660 
661 
662 
663 #if (fstb_ARCHI == fstb_ARCHI_X86)
664 
665 
666 
667 template <class TD, class M>
process_plane_flt_any_sse2(Plane<> dst,PlaneRO<> src,int w,int h) const668 void	TransLut::process_plane_flt_any_sse2 (Plane <> dst, PlaneRO <> src, int w, int h) const noexcept
669 {
670 	assert (dst.is_valid (h));
671 	assert (src.is_valid (h));
672 	assert (w > 0);
673 	assert (h > 0);
674 
675 	for (int y = 0; y < h; ++y)
676 	{
677 		const PlaneRO <FloatIntMix>   s { src };
678 		const Plane <TD>              d { dst };
679 
680 		for (int x = 0; x < w; x += 4)
681 		{
682 			union
683 			{
684 				__m128i            _vect;
685 				uint32_t           _scal [4];
686 			}                  index;
687 			__m128             lerp;
688 			TransLut_FindIndexSse2 <M>::find_index (s._ptr + x, index._vect, lerp);
689 			__m128             val = _mm_set_ps (
690 				_lut.use <float> (index._scal [3]    ),
691 				_lut.use <float> (index._scal [2]    ),
692 				_lut.use <float> (index._scal [1]    ),
693 				_lut.use <float> (index._scal [0]    )
694 			);
695 			__m128             va2 = _mm_set_ps (
696 				_lut.use <float> (index._scal [3] + 1),
697 				_lut.use <float> (index._scal [2] + 1),
698 				_lut.use <float> (index._scal [1] + 1),
699 				_lut.use <float> (index._scal [0] + 1)
700 			);
701 			const __m128       dif = _mm_sub_ps (va2, val);
702 			val = _mm_add_ps (val, _mm_mul_ps (dif, lerp));
703 			TransLut_store_sse2 (&d._ptr [x], val);
704 		}
705 
706 		src.step_line ();
707 		dst.step_line ();
708 	}
709 }
710 
711 
712 
713 #endif
714 
715 
716 
717 template <class T>
cast(float val)718 T	TransLut::Convert <T>::cast (float val) noexcept
719 {
720 	return T (fstb::conv_int_fast (val));
721 }
722 
723 template <>
cast(float val)724 float	TransLut::Convert <float>::cast (float val) noexcept
725 {
726 	return val;
727 }
728 
729 
730 
731 }	// namespace fmtcl
732 
733 
734 
735 /*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
736