1 /*
2  * LUT.h
3  *  This file is part of RawTherapee.
4  *
5  *  Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
6  *
7  *  RawTherapee is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  RawTherapee is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 /*
22  *  Declaration of flexible Lookup Tables
23  *
24  *  Usage:
25  *
26  *      LUT<type> name (size);
27  *      LUT<type> name (size, flags);
28  *
29  *      creates an array which is valid within the normal C/C++ scope "{ ... }"
30  *
31  *      access to elements is a simple as:
32  *
33  *          LUT<float> my_lut (10);
34  *          float value = my_lut[3];
35  *          float value = my_lut[2.5]; // this will interpolate
36  *
37  *      when using a float type index it will interpolate the lookup values
38  *
39  *      extra setting in flags: (clipping is set by default)
40  *      LUT_CLIP_ABOVE
41  *      LUT_CLIP_BELOW
42  *
43  *      example:
44  *          LUT<float> my_lut (10,LUT_CLIP_BELOW);
45  *          float value = my_lut[22.5];  // this will extrapolate
46  *          float value = my_lut[-22.5]; // this will not extrapolate
47  *
48  *          LUT<float> my_lut (10,0); // this will extrapolate on either side
49  *
50  *      shotcuts:
51  *
52  *          LUTf stands for LUT<float>
53  *          LUTi stands for LUT<int>
54  *          LUTu stands for LUT<unsigned int>
55  *          LUTd stands for LUT<double>
56  *          LUTuc stands for LUT<unsigned char>
57  */
58 
59 #ifndef LUT_H_
60 #define LUT_H_
61 
62 #include <cstring>
63 #include <cstdint>
64 #ifndef NDEBUG
65 #include <cassert>
66 #endif
67 #include "opthelper.h"
68 #include "rt_math.h"
69 
70 // Bit representations of flags
71 enum {
72     LUT_CLIP_OFF,   // LUT does not clip input values
73     LUT_CLIP_BELOW, // LUT clips input values at lower bound
74     LUT_CLIP_ABOVE  // LUT clips input values at upper bound
75 };
76 
77 template<typename T>
78 class LUT;
79 
80 using LUTf = LUT<float>;
81 using LUTi = LUT<int32_t>;
82 using LUTu = LUT<uint32_t>;
83 using LUTd = LUT<double>;
84 using LUTuc = LUT<uint8_t>;
85 
86 template<typename T>
87 class LUT
88 {
89 protected:
90     // list of variables ordered to improve cache speed
91     int maxs;
92     float maxsf;
93     T * data;
94     unsigned int clip;
95     unsigned int size;
96     unsigned int upperBound;  // always equals size-1, parameter created for performance reason
97 private:
98     unsigned int owner;
99 #ifdef __SSE2__
100     alignas(16) vfloat maxsv;
101     alignas(16) vfloat sizev;
102     alignas(16) vint sizeiv;
103 #endif
104 public:
105     /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
106     /// The user have to handle it itself, even if some method can (re)initialize it
107     bool dirty;
108 
109     LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
110     {
111         dirty = true;
112         clip = flags;
113         // Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
114         // The routine would still produce the right answer, but might cause issues
115         // with address/heap checking programs.
116         data = new T[s + 3];
117         owner = 1;
118         size = s;
119         upperBound = size - 1;
120         maxs = size - 2;
121         maxsf = (float)maxs;
122 #ifdef __SSE2__
123         maxsv =  F2V( maxs );
124         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
125         sizev = F2V( size - 1 );
126 #endif
127         if (initZero) {
128             clear();
129         }
130     }
operator()131     void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
132     {
133         if (owner && data) {
134             delete[] data;
135         }
136 
137         dirty = true; // Assumption!
138         clip = flags;
139         // See comment in constructor.
140         data = new T[s + 3];
141         owner = 1;
142         size = s;
143         upperBound = size - 1;
144         maxs = size - 2;
145         maxsf = (float)maxs;
146 #ifdef __SSE2__
147         maxsv =  F2V( maxs );
148         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
149         sizev = F2V( size - 1 );
150 #endif
151         if (initZero) {
152             clear();
153         }
154 
155     }
156 
LUT()157     LUT()
158     {
159         data = nullptr;
160         reset();
161 #ifdef __SSE2__
162         maxsv = ZEROV;
163         sizev = ZEROV;
164         sizeiv = _mm_setzero_si128();
165 #endif
166     }
167 
~LUT()168     ~LUT()
169     {
170         if (owner) {
171             delete[] data;
172 #ifndef NDEBUG
173             data = (T*)0xBAADF00D;
174 #endif
175         }
176     }
177 
setClip(int flags)178     void setClip(int flags)
179     {
180         clip = flags;
181     }
182 
getClip()183     int getClip() const {
184         return clip;
185     }
186 
187     /** @brief Get the number of element in the LUT (i.e. dimension of the array)
188      *  For a LUT(500), it will return 500
189      *  @return number of element in the array
190      */
getSize()191     unsigned int getSize() const
192     {
193         return size;
194     }
195 
196     /** @brief Get the highest value possible (i.e. dimension of the array)
197      *  For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
198      *  @return number of element in the array
199      */
getUpperBound()200     unsigned int getUpperBound() const
201     {
202         return size > 0 ? upperBound : 0;
203     }
204 
205     LUT<T> & operator=(LUT<T> &rhs)
206     {
207         if (this != &rhs) {
208             if (rhs.size > this->size) {
209                 delete [] this->data;
210                 this->data = nullptr;
211             }
212 
213             if (this->data == nullptr) {
214                 // See comment in constructor.
215                 this->data = new T[rhs.size + 3];
216             }
217 
218             this->clip = rhs.clip;
219             this->owner = 1;
220             memcpy(this->data, rhs.data, rhs.size * sizeof(T));
221             this->size = rhs.size;
222             this->upperBound = rhs.upperBound;
223             this->maxs = this->size - 2;
224             this->maxsf = (float)this->maxs;
225 #ifdef __SSE2__
226             this->maxsv =  F2V( this->size - 2);
227             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
228             this->sizev = F2V( this->size - 1 );
229 #endif
230         }
231 
232         return *this;
233     }
234 
235     // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t).
236     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
237     LUT<T> & operator+=(LUT<T> &rhs)
238     {
239         if (rhs.size == this->size) {
240 #ifdef _OPENMP
241             #pragma omp simd
242 #endif
243 
244             for(unsigned int i = 0; i < this->size; i++) {
245                 data[i] += rhs.data[i];
246             }
247         }
248 
249         return *this;
250     }
251 
252     // multiply all elements of LUT<float> with a constant float value
253     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
254     LUT<float> & operator*=(float factor)
255     {
256 #ifdef _OPENMP
257         #pragma omp simd
258 #endif
259 
260         for(unsigned int i = 0; i < this->size; i++) {
261             data[i] *= factor;
262         }
263 
264         return *this;
265     }
266 
267     // divide all elements of LUT<float> by a constant float value
268     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
269     LUT<float> & operator/=(float divisor)
270     {
271 #ifdef _OPENMP
272         #pragma omp simd
273 #endif
274 
275         for(unsigned int i = 0; i < this->size; i++) {
276             data[i] /= divisor;
277         }
278 
279         return *this;
280     }
281 
282 
283     // use with integer indices
284     T& operator[](int index) const
285     {
286         return data[ librtprocess::LIM<int>(index, 0, upperBound) ];
287     }
288 
289 #ifdef __SSE2__
290 
291 
292     // NOTE: This function requires LUTs which clips only at lower bound
cb(vfloat indexv)293     vfloat cb(vfloat indexv) const
294     {
295         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
296 
297         // Clamp and convert to integer values. Extract out of SSE register because all
298         // lookup operations use regular addresses.
299         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
300         vint indexes = _mm_cvttps_epi32(clampedIndexes);
301         int indexArray[4];
302         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
303 
304         // Load data from the table. This reads more than necessary, but there don't seem
305         // to exist more granular operations (though we could try non-SSE).
306         // Cast to int for convenience in the next operation (partial transpose).
307         vint values[4];
308         for (int i = 0; i < 4; ++i) {
309             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
310         }
311 
312         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
313         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
314         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
315         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
316         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
317         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
318 
319         vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
320         return vintpf(diff, upper, lower);
321     }
322 
323     // NOTE: This version requires LUTs which clip at upper and lower bounds
324     // (which is the default).
325     vfloat operator[](vfloat indexv) const
326     {
327         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
328 
329         // Clamp and convert to integer values. Extract out of SSE register because all
330         // lookup operations use regular addresses.
331         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
332         vint indexes = _mm_cvttps_epi32(clampedIndexes);
333         int indexArray[4];
334         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
335 
336         // Load data from the table. This reads more than necessary, but there don't seem
337         // to exist more granular operations (though we could try non-SSE).
338         // Cast to int for convenience in the next operation (partial transpose).
339         vint values[4];
340         for (int i = 0; i < 4; ++i) {
341             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
342         }
343 
344         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
345         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
346         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
347         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
348         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
349         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
350 
351         vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes);
352         return vintpf(diff, upper, lower);
353     }
354 
355     // NOTE: This version requires LUTs which do not clip at upper and lower bounds
operator()356     vfloat operator()(vfloat indexv) const
357     {
358         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
359 
360         // Clamp and convert to integer values. Extract out of SSE register because all
361         // lookup operations use regular addresses.
362         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
363         vint indexes = _mm_cvttps_epi32(clampedIndexes);
364         int indexArray[4];
365         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
366 
367         // Load data from the table. This reads more than necessary, but there don't seem
368         // to exist more granular operations (though we could try non-SSE).
369         // Cast to int for convenience in the next operation (partial transpose).
370         vint values[4];
371         for (int i = 0; i < 4; ++i) {
372             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
373         }
374 
375         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
376         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
377         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
378         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
379         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
380         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
381 
382         vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
383         return vintpf(diff, upper, lower);
384     }
385 
386     // vectorized LUT access with integer indices. Clips at lower and upper bounds
387 #ifdef __SSE4_1__
388     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
389     vfloat operator[](vint idxv) const
390     {
391         idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
392         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
393         return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
394     }
395 #else
396     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
397     vfloat operator[](vint idxv) const
398     {
399         vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
400         idxv = _mm_cvttps_epi32(tempv);
401         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
402         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
403                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
404                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
405                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
406     }
407 #endif
408 #endif
409 
410     // use with float indices
411     template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
412     T operator[](V index) const
413     {
414         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
415 
416         if (index < 0.f) {
417             if (clip & LUT_CLIP_BELOW) {
418                 return data[0];
419             }
420 
421             idx = 0;
422         } else if (idx > maxs) {
423             if (clip & LUT_CLIP_ABOVE) {
424                 return data[upperBound];
425             }
426 
427             idx = maxs;
428         }
429 
430         float diff = index - (float) idx;
431         T p1 = data[idx];
432         T p2 = data[idx + 1] - p1;
433         return (p1 + p2 * diff);
434     }
435 
436     // Return the value for "index" that is in the [0-1] range.
437     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
getVal01(float index)438     T getVal01 (float index) const
439     {
440         index *= (float)upperBound;
441         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
442 
443         if (index < 0.f) {
444             if (clip & LUT_CLIP_BELOW) {
445                 return data[0];
446             }
447 
448             idx = 0;
449         } else if (index > maxsf) {
450             if (clip & LUT_CLIP_ABOVE) {
451                 return data[upperBound];
452             }
453 
454             idx = maxs;
455         }
456 
457         float diff = index - (float) idx;
458         T p1 = data[idx];
459         T p2 = data[idx + 1] - p1;
460         return (p1 + p2 * diff);
461     }
462 
463     operator bool (void) const
464     {
465         return size > 0;
466     }
467 
clear(void)468     void clear(void)
469     {
470         if (data && size) {
471             memset(data, 0, size * sizeof(T));
472         }
473     }
474 
reset(void)475     void reset(void)
476     {
477         if (data) {
478             delete[] data;
479         }
480 
481         dirty = true;
482         data = nullptr;
483         owner = 1;
484         size = 0;
485         upperBound = 0;
486         maxs = 0;
487         maxsf = 0.f;
488         clip = 0;
489     }
490 
491     // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
492     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
493     void makeIdentity(float divisor = 1.f)
494     {
495         if(divisor == 1.f) {
496             for(unsigned int i = 0; i < size; i++) {
497                 data[i] = i;
498             }
499         } else {
500             for(unsigned int i = 0; i < size; i++) {
501                 data[i] = i / divisor;
502             }
503         }
504     }
505 
506     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x)
507     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
508     void compressTo(LUT<T> &dest, unsigned int numVals = 0) const
509     {
510         numVals = numVals == 0 ? size : numVals;
511         numVals = std::min(numVals, size);
512         float divisor = numVals - 1;
513         float mult = (dest.size - 1) / divisor;
514 
515         for (unsigned int i = 0; i < numVals; i++) {
516             int hi = (int)(mult * i);
517             dest.data[hi] += this->data[i] ;
518         }
519     }
520 
521     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes
522     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)523     void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const
524     {
525         if(passThrough) {
526             numVals = std::min(numVals, size);
527             numVals = std::min(numVals, passThrough.getSize());
528             float mult = dest.size - 1;
529 
530             for (unsigned int i = 0; i < numVals; i++) {
531                 int hi = (int)(mult * passThrough[i]);
532                 dest[hi] += this->data[i] ;
533             }
534         }
535     }
536 
537     // compute sum and average of a LUT<uint32_t>
538     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
getSumAndAverage(float & sum,float & avg)539     void getSumAndAverage(float &sum, float &avg) const
540     {
541         sum = 0.f;
542         avg = 0.f;
543         int i = 0;
544 #ifdef __SSE2__
545         vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
546         vfloat fourv = F2V(4.f);
547         vint sumv = (vint)ZEROV;
548         vfloat avgv = ZEROV;
549 
550         for(; i < static_cast<int>(size) - 3; i += 4) {
551             vint datav = _mm_loadu_si128((__m128i*)&data[i]);
552             sumv += datav;
553             avgv += iv * _mm_cvtepi32_ps(datav);
554             iv += fourv;
555 
556         }
557 
558         sum = vhadd(_mm_cvtepi32_ps(sumv));
559         avg = vhadd(avgv);
560 #endif
561 
562         for (; i < static_cast<int>(size); i++) {
563             T val = data[i];
564             sum += val;
565             avg += i * val;
566         }
567 
568         avg /= sum;
569     }
570 
571 
572     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
573     void makeConstant(float value, unsigned int numVals = 0)
574     {
575         numVals = numVals == 0 ? size : numVals;
576         numVals = std::min(numVals, size);
577 
578         for(unsigned int i = 0; i < numVals; i++) {
579             data[i] = value;
580         }
581     }
582 
583     // share the buffer with another LUT, handy for same data but different clip flags
584     void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE)
585     {
586         if (owner && data) {
587             delete[] data;
588         }
589 
590         dirty = false;  // Assumption
591         clip = flags;
592         data = source.data;
593         owner = 0;
594         size = source.getSize();
595         upperBound = size - 1;
596         maxs = size - 2;
597         maxsf = (float)maxs;
598 #ifdef __SSE2__
599         maxsv =  F2V( size - 2);
600         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
601         sizev = F2V( size - 1 );
602 #endif
603     }
604 
605 
606 };
607 
608 #endif /* LUT_H_ */
609