1 /*
2  * LUT.h
3  *  This file is part of RawTherapee.
4  *
5  *  Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
6  *
7  *  RawTherapee is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  RawTherapee is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 /*
22  *  Declaration of flexible Lookup Tables
23  *
24  *  Usage:
25  *
26  *      LUT<type> name (size);
27  *      LUT<type> name (size, flags);
28  *
29  *      creates an array which is valid within the normal C/C++ scope "{ ... }"
30  *
31  *      access to elements is a simple as:
32  *
33  *          LUT<float> my_lut (10);
34  *          float value = my_lut[3];
35  *          float value = my_lut[2.5]; // this will interpolate
36  *
37  *      when using a float type index it will interpolate the lookup values
38  *
39  *      extra setting in flags: (clipping is set by default)
40  *      LUT_CLIP_ABOVE
41  *      LUT_CLIP_BELOW
42  *
43  *      example:
44  *          LUT<float> my_lut (10,LUT_CLIP_BELOW);
45  *          float value = my_lut[22.5];  // this will extrapolate
46  *          float value = my_lut[-22.5]; // this will not extrapolate
47  *
48  *          LUT<float> my_lut (10,0); // this will extrapolate on either side
49  *
50  *      shotcuts:
51  *
52  *          LUTf stands for LUT<float>
53  *          LUTi stands for LUT<int>
54  *          LUTu stands for LUT<unsigned int>
55  *          LUTd stands for LUT<double>
56  *          LUTuc stands for LUT<unsigned char>
57  */
58 
59 #ifndef LUT_H_
60 #define LUT_H_
61 
62 #include <cstring>
63 #include <cstdint>
64 #include <cassert>
65 
66 #ifndef NDEBUG
67 #include <glibmm.h>
68 #include <fstream>
69 #endif
70 
71 #include "opthelper.h"
72 #include "rt_math.h"
73 #include "noncopyable.h"
74 
75 // Bit representations of flags
76 enum {
77     LUT_CLIP_OFF,   // LUT does not clip input values
78     LUT_CLIP_BELOW, // LUT clips input values at lower bound
79     LUT_CLIP_ABOVE  // LUT clips input values at upper bound
80 };
81 
82 template<typename T>
83 class LUT;
84 
85 using LUTf = LUT<float>;
86 using LUTi = LUT<int32_t>;
87 using LUTu = LUT<uint32_t>;
88 using LUTd = LUT<double>;
89 using LUTuc = LUT<uint8_t>;
90 
91 template<typename T>
92 class LUT
93 {
94 protected:
95     // list of variables ordered to improve cache speed
96     int maxs;
97     float maxsf;
98     T * data;
99     unsigned int clip;
100     unsigned int size;
101     unsigned int upperBound;  // always equals size-1, parameter created for performance reason
102 private:
103     unsigned int owner;
104 #ifdef __SSE2__
105     alignas(16) vfloat maxsv;
106     alignas(16) vfloat sizev;
107     alignas(16) vint sizeiv;
108 #endif
109 public:
110     /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
111     /// The user have to handle it itself, even if some method can (re)initialize it
112     bool dirty;
113 
114     LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
115     {
116 #ifndef NDEBUG
117 
118         if (s <= 0) {
119             printf("s<=0!\n");
120         }
121 
122         assert (s > 0);
123 #endif
124         dirty = true;
125         clip = flags;
126         // Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
127         // The routine would still produce the right answer, but might cause issues
128         // with address/heap checking programs.
129         data = new T[s + 3];
130         owner = 1;
131         size = s;
132         upperBound = size - 1;
133         maxs = size - 2;
134         maxsf = (float)maxs;
135 #ifdef __SSE2__
136         maxsv =  F2V( maxs );
137         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
138         sizev = F2V( size - 1 );
139 #endif
140         if (initZero) {
141             clear();
142         }
143     }
operator()144     void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
145     {
146 #ifndef NDEBUG
147 
148         if (s <= 0) {
149             printf("s<=0!\n");
150         }
151 
152         assert (s > 0);
153 #endif
154 
155         if (owner && data) {
156             delete[] data;
157         }
158 
159         dirty = true; // Assumption!
160         clip = flags;
161         // See comment in constructor.
162         data = new T[s + 3];
163         owner = 1;
164         size = s;
165         upperBound = size - 1;
166         maxs = size - 2;
167         maxsf = (float)maxs;
168 #ifdef __SSE2__
169         maxsv =  F2V( maxs );
170         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
171         sizev = F2V( size - 1 );
172 #endif
173         if (initZero) {
174             clear();
175         }
176 
177     }
178 
LUT()179     LUT()
180     {
181         data = nullptr;
182         reset();
183 #ifdef __SSE2__
184         maxsv = ZEROV;
185         sizev = ZEROV;
186         sizeiv = _mm_setzero_si128();
187 #endif
188     }
189 
~LUT()190     ~LUT()
191     {
192         if (owner) {
193             delete[] data;
194 #ifndef NDEBUG
195             data = (T*)0xBAADF00D;
196 #endif
197         }
198     }
199 
200     explicit LUT(const LUT&) = delete;
201 
setClip(int flags)202     void setClip(int flags)
203     {
204         clip = flags;
205     }
206 
getClip()207     int getClip() const {
208         return clip;
209     }
210 
211     /** @brief Get the number of element in the LUT (i.e. dimension of the array)
212      *  For a LUT(500), it will return 500
213      *  @return number of element in the array
214      */
getSize()215     unsigned int getSize() const
216     {
217         return size;
218     }
219 
220     /** @brief Get the highest value possible (i.e. dimension of the array)
221      *  For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
222      *  @return number of element in the array
223      */
getUpperBound()224     unsigned int getUpperBound() const
225     {
226         return size > 0 ? upperBound : 0;
227     }
228 
229     LUT<T> & operator=(const LUT<T>& rhs)
230     {
231         if (this != &rhs) {
232             if (rhs.size > this->size) {
233                 delete [] this->data;
234                 this->data = nullptr;
235             }
236 
237             if (this->data == nullptr) {
238                 // See comment in constructor.
239                 this->data = new T[rhs.size + 3];
240             }
241 
242             this->clip = rhs.clip;
243             this->owner = 1;
244             memcpy(this->data, rhs.data, rhs.size * sizeof(T));
245             this->size = rhs.size;
246             this->upperBound = rhs.upperBound;
247             this->maxs = this->size - 2;
248             this->maxsf = (float)this->maxs;
249 #ifdef __SSE2__
250             this->maxsv =  F2V( this->size - 2);
251             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
252             this->sizev = F2V( this->size - 1 );
253 #endif
254         }
255 
256         return *this;
257     }
258 
259     // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t).
260     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
261     LUT<T> & operator+=(const LUT<T>& rhs)
262     {
263         if (rhs.size == this->size) {
264 #ifdef _OPENMP
265             #pragma omp simd
266 #endif
267 
268             for(unsigned int i = 0; i < this->size; i++) {
269                 data[i] += rhs.data[i];
270             }
271         }
272 
273         return *this;
274     }
275 
276     // multiply all elements of LUT<float> with a constant float value
277     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
278     LUT<float> & operator*=(float factor)
279     {
280 #ifdef _OPENMP
281         #pragma omp simd
282 #endif
283 
284         for(unsigned int i = 0; i < this->size; i++) {
285             data[i] *= factor;
286         }
287 
288         return *this;
289     }
290 
291     // divide all elements of LUT<float> by a constant float value
292     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
293     LUT<float> & operator/=(float divisor)
294     {
295 #ifdef _OPENMP
296         #pragma omp simd
297 #endif
298 
299         for(unsigned int i = 0; i < this->size; i++) {
300             data[i] /= divisor;
301         }
302 
303         return *this;
304     }
305 
306 
307     // use with integer indices
308     T& operator[](int index) const
309     {
310         return data[ rtengine::LIM<int>(index, 0, upperBound) ];
311     }
312 
313 #ifdef __SSE2__
314 
315 
316     // NOTE: This function requires LUTs which clips only at lower bound
cb(vfloat indexv)317     vfloat cb(vfloat indexv) const
318     {
319         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
320 
321         // Clamp and convert to integer values. Extract out of SSE register because all
322         // lookup operations use regular addresses.
323         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
324         vint indexes = _mm_cvttps_epi32(clampedIndexes);
325         int indexArray[4];
326         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
327 
328         // Load data from the table. This reads more than necessary, but there don't seem
329         // to exist more granular operations (though we could try non-SSE).
330         // Cast to int for convenience in the next operation (partial transpose).
331         vint values[4];
332         for (int i = 0; i < 4; ++i) {
333             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
334         }
335 
336         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
337         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
338         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
339         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
340         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
341         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
342 
343         vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
344         return vintpf(diff, upper, lower);
345     }
346 
347     // NOTE: This version requires LUTs which clip at upper and lower bounds
348     // (which is the default).
349     vfloat operator[](vfloat indexv) const
350     {
351         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
352 
353         // Clamp and convert to integer values. Extract out of SSE register because all
354         // lookup operations use regular addresses.
355         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
356         vint indexes = _mm_cvttps_epi32(clampedIndexes);
357         int indexArray[4];
358         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
359 
360         // Load data from the table. This reads more than necessary, but there don't seem
361         // to exist more granular operations (though we could try non-SSE).
362         // Cast to int for convenience in the next operation (partial transpose).
363         vint values[4];
364         for (int i = 0; i < 4; ++i) {
365             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
366         }
367 
368         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
369         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
370         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
371         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
372         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
373         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
374 
375         vfloat diff = vclampf(indexv, ZEROV, sizev) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN
376         return vintpf(diff, upper, lower);
377     }
378 
379     // NOTE: This version requires LUTs which do not clip at upper and lower bounds
operator()380     vfloat operator()(vfloat indexv) const
381     {
382         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
383 
384         // Clamp and convert to integer values. Extract out of SSE register because all
385         // lookup operations use regular addresses.
386         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
387         vint indexes = _mm_cvttps_epi32(clampedIndexes);
388         int indexArray[4];
389         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
390 
391         // Load data from the table. This reads more than necessary, but there don't seem
392         // to exist more granular operations (though we could try non-SSE).
393         // Cast to int for convenience in the next operation (partial transpose).
394         vint values[4];
395         for (int i = 0; i < 4; ++i) {
396             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
397         }
398 
399         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
400         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
401         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
402         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
403         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
404         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
405 
406         vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
407         return vintpf(diff, upper, lower);
408     }
409 
410     // vectorized LUT access with integer indices. Clips at lower and upper bounds
411 #ifdef __SSE4_1__
412     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
413     vfloat operator[](vint idxv) const
414     {
415         idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
416         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
417         return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
418     }
419 #else
420     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
421     vfloat operator[](vint idxv) const
422     {
423         // convert to float because SSE2 has no min/max for 32bit integers
424         vfloat tempv = vclampf(_mm_cvtepi32_ps(idxv), ZEROV, sizev); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int)
425         idxv = _mm_cvttps_epi32(tempv);
426         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
427         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
428                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
429                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
430                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
431     }
432 #endif
433 #endif
434 
435     // use with float indices
436     template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
437     T operator[](V index) const
438     {
439         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
440 
441         if (index < 0.f) {
442             if (clip & LUT_CLIP_BELOW) {
443                 return data[0];
444             }
445 
446             idx = 0;
447         } else if (index > maxsf) {
448             if (clip & LUT_CLIP_ABOVE) {
449                 return data[upperBound];
450             }
451 
452             idx = maxs;
453         }
454 
455         float diff = index - (float) idx;
456         T p1 = data[idx];
457         T p2 = data[idx + 1] - p1;
458         return (p1 + p2 * diff);
459     }
460 
461     // Return the value for "index" that is in the [0-1] range.
462     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
getVal01(float index)463     T getVal01 (float index) const
464     {
465         index *= (float)upperBound;
466         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
467 
468         if (index < 0.f) {
469             if (clip & LUT_CLIP_BELOW) {
470                 return data[0];
471             }
472 
473             idx = 0;
474         } else if (index > maxsf) {
475             if (clip & LUT_CLIP_ABOVE) {
476                 return data[upperBound];
477             }
478 
479             idx = maxs;
480         }
481 
482         float diff = index - (float) idx;
483         T p1 = data[idx];
484         T p2 = data[idx + 1] - p1;
485         return (p1 + p2 * diff);
486     }
487 
488 #ifndef NDEBUG
489     // Debug facility ; dump the content of the LUT in a file. No control of the filename is done
dump(Glib::ustring fname)490     void dump(Glib::ustring fname)
491     {
492         if (size) {
493             Glib::ustring fname_ = fname + ".xyz"; // TopSolid'Design "plot" file format
494             std::ofstream f (fname_.c_str());
495             f << "$" << std::endl;
496 
497             for (unsigned int iter = 0; iter < size; iter++) {
498                 f << iter << ", " << data[iter] << ", 0." << std::endl;
499             }
500 
501             f << "$" << std::endl;
502             f.close ();
503         }
504     }
505 #endif
506 
507 
508     operator bool (void) const
509     {
510         return size > 0;
511     }
512 
clear(void)513     void clear(void)
514     {
515         if (data && size) {
516             memset(data, 0, size * sizeof(T));
517         }
518     }
519 
reset(void)520     void reset(void)
521     {
522         if (data) {
523             delete[] data;
524         }
525 
526         dirty = true;
527         data = nullptr;
528         owner = 1;
529         size = 0;
530         upperBound = 0;
531         maxs = 0;
532         maxsf = 0.f;
533         clip = 0;
534     }
535 
536     // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
537     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
538     void makeIdentity(float divisor = 1.f)
539     {
540         if(divisor == 1.f) {
541             for(unsigned int i = 0; i < size; i++) {
542                 data[i] = i;
543             }
544         } else {
545             for(unsigned int i = 0; i < size; i++) {
546                 data[i] = i / divisor;
547             }
548         }
549     }
550 
551     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x)
552     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
553     void compressTo(LUT<T> &dest, unsigned int numVals = 0) const
554     {
555         numVals = numVals == 0 ? size : numVals;
556         numVals = std::min(numVals, size);
557         float divisor = numVals - 1;
558         float mult = (dest.size - 1) / divisor;
559 
560         for (unsigned int i = 0; i < numVals; i++) {
561             int hi = (int)(mult * i);
562             dest.data[hi] += this->data[i] ;
563         }
564     }
565 
566     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes
567     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)568     void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const
569     {
570         if(passThrough) {
571             numVals = std::min(numVals, size);
572             numVals = std::min(numVals, passThrough.getSize());
573             float mult = dest.size - 1;
574 
575             for (unsigned int i = 0; i < numVals; i++) {
576                 int hi = (int)(mult * passThrough[i]);
577                 dest[hi] += this->data[i] ;
578             }
579         }
580     }
581 
582     // compute sum and average of a LUT<uint32_t>
583     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
getSumAndAverage(float & sum,float & avg)584     void getSumAndAverage(float &sum, float &avg) const
585     {
586         sum = 0.f;
587         avg = 0.f;
588         int i = 0;
589 #ifdef __SSE2__
590         vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
591         vfloat fourv = F2V(4.f);
592         vint sumv = (vint)ZEROV;
593         vfloat avgv = ZEROV;
594 
595         for(; i < static_cast<int>(size) - 3; i += 4) {
596             vint datav = _mm_loadu_si128((__m128i*)&data[i]);
597             sumv += datav;
598             avgv += iv * _mm_cvtepi32_ps(datav);
599             iv += fourv;
600 
601         }
602 
603         sum = vhadd(_mm_cvtepi32_ps(sumv));
604         avg = vhadd(avgv);
605 #endif
606 
607         for (; i < static_cast<int>(size); i++) {
608             T val = data[i];
609             sum += val;
610             avg += i * val;
611         }
612 
613         avg /= sum;
614     }
615 
616 
617     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
618     void makeConstant(float value, unsigned int numVals = 0)
619     {
620         numVals = numVals == 0 ? size : numVals;
621         numVals = std::min(numVals, size);
622 
623         for(unsigned int i = 0; i < numVals; i++) {
624             data[i] = value;
625         }
626     }
627 
628     // share the buffer with another LUT, handy for same data but different clip flags
629     void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE)
630     {
631         if (owner && data) {
632             delete[] data;
633         }
634 
635         dirty = false;  // Assumption
636         clip = flags;
637         data = source.data;
638         owner = 0;
639         size = source.getSize();
640         upperBound = size - 1;
641         maxs = size - 2;
642         maxsf = (float)maxs;
643 #ifdef __SSE2__
644         maxsv =  F2V( size - 2);
645         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
646         sizev = F2V( size - 1 );
647 #endif
648     }
649 
650 
651 };
652 
653 #endif /* LUT_H_ */
654