1 /*
2  * LUT.h
3  *  This file is part of RawTherapee.
4  *
5  *  Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
6  *
7  *  RawTherapee is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  RawTherapee is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 /*
22  *  Declaration of flexible Lookup Tables
23  *
24  *  Usage:
25  *
26  *      LUT<type> name (size);
27  *      LUT<type> name (size, flags);
28  *
29  *      creates an array which is valid within the normal C/C++ scope "{ ... }"
30  *
31  *      access to elements is a simple as:
32  *
33  *          LUT<float> my_lut (10);
34  *          float value = my_lut[3];
35  *          float value = my_lut[2.5]; // this will interpolate
36  *
37  *      when using a float type index it will interpolate the lookup values
38  *
39  *      extra setting in flags: (clipping is set by default)
40  *      LUT_CLIP_ABOVE
41  *      LUT_CLIP_BELOW
42  *
43  *      example:
44  *          LUT<float> my_lut (10,LUT_CLIP_BELOW);
45  *          float value = my_lut[22.5];  // this will extrapolate
46  *          float value = my_lut[-22.5]; // this will not extrapolate
47  *
48  *          LUT<float> my_lut (10,0); // this will extrapolate on either side
49  *
50  *      shotcuts:
51  *
52  *          LUTf stands for LUT<float>
53  *          LUTi stands for LUT<int>
54  *          LUTu stands for LUT<unsigned int>
55  *          LUTd stands for LUT<double>
56  *          LUTuc stands for LUT<unsigned char>
57  */
58 
59 #ifndef LUT_H_
60 #define LUT_H_
61 
62 #include <cstring>
63 #include <cstdint>
64 #ifndef NDEBUG
65 #include <cassert>
66 #endif
67 #include "opthelper.h"
68 #include "rt_math.h"
69 
70 // Bit representations of flags
71 enum {
72     LUT_CLIP_OFF,   // LUT does not clip input values
73     LUT_CLIP_BELOW, // LUT clips input values at lower bound
74     LUT_CLIP_ABOVE  // LUT clips input values at upper bound
75 };
76 
77 template<typename T>
78 class LUT;
79 
80 using LUTf = LUT<float>;
81 using LUTi = LUT<int32_t>;
82 using LUTu = LUT<uint32_t>;
83 using LUTd = LUT<double>;
84 using LUTuc = LUT<uint8_t>;
85 
86 template<typename T>
87 class LUT
88 {
89 protected:
90     // list of variables ordered to improve cache speed
91     int maxs;
92     float maxsf;
93     T * data;
94     unsigned int clip;
95     unsigned int size;
96     unsigned int upperBound;  // always equals size-1, parameter created for performance reason
97 private:
98     unsigned int owner;
99 #ifdef __SSE2__
100     alignas(16) vfloat maxsv;
101     alignas(16) vfloat sizev;
102     alignas(16) vint sizeiv;
103 #endif
104 public:
105     /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
106     /// The user have to handle it itself, even if some method can (re)initialize it
107     bool dirty;
108 
109     LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
110     {
111 #ifndef NDEBUG
112 
113         if (s <= 0) {
114             printf("s<=0!\n");
115         }
116 
117         assert (s > 0);
118 #endif
119         dirty = true;
120         clip = flags;
121         // Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
122         // The routine would still produce the right answer, but might cause issues
123         // with address/heap checking programs.
124         data = new T[s + 3];
125         owner = 1;
126         size = s;
127         upperBound = size - 1;
128         maxs = size - 2;
129         maxsf = (float)maxs;
130 #ifdef __SSE2__
131         maxsv =  F2V( maxs );
132         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
133         sizev = F2V( size - 1 );
134 #endif
135         if (initZero) {
136             clear();
137         }
138     }
operator()139     void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
140     {
141 #ifndef NDEBUG
142 
143         if (s <= 0) {
144             printf("s<=0!\n");
145         }
146 
147         assert (s > 0);
148 #endif
149 
150         if (owner && data) {
151             delete[] data;
152         }
153 
154         dirty = true; // Assumption!
155         clip = flags;
156         // See comment in constructor.
157         data = new T[s + 3];
158         owner = 1;
159         size = s;
160         upperBound = size - 1;
161         maxs = size - 2;
162         maxsf = (float)maxs;
163 #ifdef __SSE2__
164         maxsv =  F2V( maxs );
165         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
166         sizev = F2V( size - 1 );
167 #endif
168         if (initZero) {
169             clear();
170         }
171 
172     }
173 
LUT()174     LUT()
175     {
176         data = nullptr;
177         reset();
178 #ifdef __SSE2__
179         maxsv = ZEROV;
180         sizev = ZEROV;
181         sizeiv = _mm_setzero_si128();
182 #endif
183     }
184 
~LUT()185     ~LUT()
186     {
187         if (owner) {
188             delete[] data;
189 #ifndef NDEBUG
190             data = (T*)0xBAADF00D;
191 #endif
192         }
193     }
194 
setClip(int flags)195     void setClip(int flags)
196     {
197         clip = flags;
198     }
199 
getClip()200     int getClip() const {
201         return clip;
202     }
203 
204     /** @brief Get the number of element in the LUT (i.e. dimension of the array)
205      *  For a LUT(500), it will return 500
206      *  @return number of element in the array
207      */
getSize()208     unsigned int getSize() const
209     {
210         return size;
211     }
212 
213     /** @brief Get the highest value possible (i.e. dimension of the array)
214      *  For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
215      *  @return number of element in the array
216      */
getUpperBound()217     unsigned int getUpperBound() const
218     {
219         return size > 0 ? upperBound : 0;
220     }
221 
222     LUT<T> & operator=(LUT<T> &rhs)
223     {
224         if (this != &rhs) {
225             if (rhs.size > this->size) {
226                 delete [] this->data;
227                 this->data = nullptr;
228             }
229 
230             if (this->data == nullptr) {
231                 // See comment in constructor.
232                 this->data = new T[rhs.size + 3];
233             }
234 
235             this->clip = rhs.clip;
236             this->owner = 1;
237             memcpy(this->data, rhs.data, rhs.size * sizeof(T));
238             this->size = rhs.size;
239             this->upperBound = rhs.upperBound;
240             this->maxs = this->size - 2;
241             this->maxsf = (float)this->maxs;
242 #ifdef __SSE2__
243             this->maxsv =  F2V( this->size - 2);
244             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
245             this->sizev = F2V( this->size - 1 );
246 #endif
247         }
248 
249         return *this;
250     }
251 
252     // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t).
253     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
254     LUT<T> & operator+=(LUT<T> &rhs)
255     {
256         if (rhs.size == this->size) {
257 #ifdef _OPENMP
258             #pragma omp simd
259 #endif
260 
261             for(unsigned int i = 0; i < this->size; i++) {
262                 data[i] += rhs.data[i];
263             }
264         }
265 
266         return *this;
267     }
268 
269     // multiply all elements of LUT<float> with a constant float value
270     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
271     LUT<float> & operator*=(float factor)
272     {
273 #ifdef _OPENMP
274         #pragma omp simd
275 #endif
276 
277         for(unsigned int i = 0; i < this->size; i++) {
278             data[i] *= factor;
279         }
280 
281         return *this;
282     }
283 
284     // divide all elements of LUT<float> by a constant float value
285     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
286     LUT<float> & operator/=(float divisor)
287     {
288 #ifdef _OPENMP
289         #pragma omp simd
290 #endif
291 
292         for(unsigned int i = 0; i < this->size; i++) {
293             data[i] /= divisor;
294         }
295 
296         return *this;
297     }
298 
299 
300     // use with integer indices
301     T& operator[](int index) const
302     {
303         return data[ librtprocess::LIM<int>(index, 0, upperBound) ];
304     }
305 
306 #ifdef __SSE2__
307 
308 
309     // NOTE: This function requires LUTs which clips only at lower bound
cb(vfloat indexv)310     vfloat cb(vfloat indexv) const
311     {
312         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
313 
314         // Clamp and convert to integer values. Extract out of SSE register because all
315         // lookup operations use regular addresses.
316         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
317         vint indexes = _mm_cvttps_epi32(clampedIndexes);
318         int indexArray[4];
319         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
320 
321         // Load data from the table. This reads more than necessary, but there don't seem
322         // to exist more granular operations (though we could try non-SSE).
323         // Cast to int for convenience in the next operation (partial transpose).
324         vint values[4];
325         for (int i = 0; i < 4; ++i) {
326             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
327         }
328 
329         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
330         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
331         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
332         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
333         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
334         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
335 
336         vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
337         return vintpf(diff, upper, lower);
338     }
339 
340     // NOTE: This version requires LUTs which clip at upper and lower bounds
341     // (which is the default).
342     vfloat operator[](vfloat indexv) const
343     {
344         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
345 
346         // Clamp and convert to integer values. Extract out of SSE register because all
347         // lookup operations use regular addresses.
348         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
349         vint indexes = _mm_cvttps_epi32(clampedIndexes);
350         int indexArray[4];
351         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
352 
353         // Load data from the table. This reads more than necessary, but there don't seem
354         // to exist more granular operations (though we could try non-SSE).
355         // Cast to int for convenience in the next operation (partial transpose).
356         vint values[4];
357         for (int i = 0; i < 4; ++i) {
358             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
359         }
360 
361         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
362         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
363         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
364         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
365         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
366         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
367 
368         vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes);
369         return vintpf(diff, upper, lower);
370     }
371 
372     // NOTE: This version requires LUTs which do not clip at upper and lower bounds
operator()373     vfloat operator()(vfloat indexv) const
374     {
375         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
376 
377         // Clamp and convert to integer values. Extract out of SSE register because all
378         // lookup operations use regular addresses.
379         vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv));
380         vint indexes = _mm_cvttps_epi32(clampedIndexes);
381         int indexArray[4];
382         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
383 
384         // Load data from the table. This reads more than necessary, but there don't seem
385         // to exist more granular operations (though we could try non-SSE).
386         // Cast to int for convenience in the next operation (partial transpose).
387         vint values[4];
388         for (int i = 0; i < 4; ++i) {
389             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
390         }
391 
392         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
393         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
394         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
395         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
396         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
397         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
398 
399         vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
400         return vintpf(diff, upper, lower);
401     }
402 
403     // vectorized LUT access with integer indices. Clips at lower and upper bounds
404 #ifdef __SSE4_1__
405     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
406     vfloat operator[](vint idxv) const
407     {
408         idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
409         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
410         return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
411     }
412 #else
413     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
414     vfloat operator[](vint idxv) const
415     {
416         vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers
417         idxv = _mm_cvttps_epi32(tempv);
418         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
419         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
420                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
421                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
422                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
423     }
424 #endif
425 #endif
426 
427     // use with float indices
428     template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
429     T operator[](V index) const
430     {
431         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
432 
433         if (index < 0.f) {
434             if (clip & LUT_CLIP_BELOW) {
435                 return data[0];
436             }
437 
438             idx = 0;
439         } else if (idx > maxs) {
440             if (clip & LUT_CLIP_ABOVE) {
441                 return data[upperBound];
442             }
443 
444             idx = maxs;
445         }
446 
447         float diff = index - (float) idx;
448         T p1 = data[idx];
449         T p2 = data[idx + 1] - p1;
450         return (p1 + p2 * diff);
451     }
452 
453     // Return the value for "index" that is in the [0-1] range.
454     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
getVal01(float index)455     T getVal01 (float index) const
456     {
457         index *= (float)upperBound;
458         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
459 
460         if (index < 0.f) {
461             if (clip & LUT_CLIP_BELOW) {
462                 return data[0];
463             }
464 
465             idx = 0;
466         } else if (index > maxsf) {
467             if (clip & LUT_CLIP_ABOVE) {
468                 return data[upperBound];
469             }
470 
471             idx = maxs;
472         }
473 
474         float diff = index - (float) idx;
475         T p1 = data[idx];
476         T p2 = data[idx + 1] - p1;
477         return (p1 + p2 * diff);
478     }
479 
480     operator bool (void) const
481     {
482         return size > 0;
483     }
484 
clear(void)485     void clear(void)
486     {
487         if (data && size) {
488             memset(data, 0, size * sizeof(T));
489         }
490     }
491 
reset(void)492     void reset(void)
493     {
494         if (data) {
495             delete[] data;
496         }
497 
498         dirty = true;
499         data = nullptr;
500         owner = 1;
501         size = 0;
502         upperBound = 0;
503         maxs = 0;
504         maxsf = 0.f;
505         clip = 0;
506     }
507 
508     // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
509     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
510     void makeIdentity(float divisor = 1.f)
511     {
512         if(divisor == 1.f) {
513             for(unsigned int i = 0; i < size; i++) {
514                 data[i] = i;
515             }
516         } else {
517             for(unsigned int i = 0; i < size; i++) {
518                 data[i] = i / divisor;
519             }
520         }
521     }
522 
523     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x)
524     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
525     void compressTo(LUT<T> &dest, unsigned int numVals = 0) const
526     {
527         numVals = numVals == 0 ? size : numVals;
528         numVals = std::min(numVals, size);
529         float divisor = numVals - 1;
530         float mult = (dest.size - 1) / divisor;
531 
532         for (unsigned int i = 0; i < numVals; i++) {
533             int hi = (int)(mult * i);
534             dest.data[hi] += this->data[i] ;
535         }
536     }
537 
538     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes
539     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)540     void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const
541     {
542         if(passThrough) {
543             numVals = std::min(numVals, size);
544             numVals = std::min(numVals, passThrough.getSize());
545             float mult = dest.size - 1;
546 
547             for (unsigned int i = 0; i < numVals; i++) {
548                 int hi = (int)(mult * passThrough[i]);
549                 dest[hi] += this->data[i] ;
550             }
551         }
552     }
553 
554     // compute sum and average of a LUT<uint32_t>
555     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
getSumAndAverage(float & sum,float & avg)556     void getSumAndAverage(float &sum, float &avg) const
557     {
558         sum = 0.f;
559         avg = 0.f;
560         int i = 0;
561 #ifdef __SSE2__
562         vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
563         vfloat fourv = F2V(4.f);
564         vint sumv = (vint)ZEROV;
565         vfloat avgv = ZEROV;
566 
567         for(; i < static_cast<int>(size) - 3; i += 4) {
568             vint datav = _mm_loadu_si128((__m128i*)&data[i]);
569             sumv += datav;
570             avgv += iv * _mm_cvtepi32_ps(datav);
571             iv += fourv;
572 
573         }
574 
575         sum = vhadd(_mm_cvtepi32_ps(sumv));
576         avg = vhadd(avgv);
577 #endif
578 
579         for (; i < static_cast<int>(size); i++) {
580             T val = data[i];
581             sum += val;
582             avg += i * val;
583         }
584 
585         avg /= sum;
586     }
587 
588 
589     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
590     void makeConstant(float value, unsigned int numVals = 0)
591     {
592         numVals = numVals == 0 ? size : numVals;
593         numVals = std::min(numVals, size);
594 
595         for(unsigned int i = 0; i < numVals; i++) {
596             data[i] = value;
597         }
598     }
599 
600     // share the buffer with another LUT, handy for same data but different clip flags
601     void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE)
602     {
603         if (owner && data) {
604             delete[] data;
605         }
606 
607         dirty = false;  // Assumption
608         clip = flags;
609         data = source.data;
610         owner = 0;
611         size = source.getSize();
612         upperBound = size - 1;
613         maxs = size - 2;
614         maxsf = (float)maxs;
615 #ifdef __SSE2__
616         maxsv =  F2V( size - 2);
617         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
618         sizev = F2V( size - 1 );
619 #endif
620     }
621 
622 
623 };
624 
625 #endif /* LUT_H_ */
626