1 /*
2  * LUT.h
3  *  This file is part of RawTherapee.
4  *
5  *  Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com)
6  *
7  *  RawTherapee is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  RawTherapee is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
19  */
20 
21 /*
22  *  Declaration of flexible Lookup Tables
23  *
24  *  Usage:
25  *
26  *      LUT<type> name (size);
27  *      LUT<type> name (size, flags);
28  *
29  *      creates an array which is valid within the normal C/C++ scope "{ ... }"
30  *
31  *      access to elements is a simple as:
32  *
33  *          LUT<float> my_lut (10);
34  *          float value = my_lut[3];
35  *          float value = my_lut[2.5]; // this will interpolate
36  *
37  *      when using a float type index it will interpolate the lookup values
38  *
39  *      extra setting in flags: (clipping is set by default)
40  *      LUT_CLIP_ABOVE
41  *      LUT_CLIP_BELOW
42  *
43  *      example:
44  *          LUT<float> my_lut (10,LUT_CLIP_BELOW);
45  *          float value = my_lut[22.5];  // this will extrapolate
46  *          float value = my_lut[-22.5]; // this will not extrapolate
47  *
48  *          LUT<float> my_lut (10,0); // this will extrapolate on either side
49  *
50  *      shotcuts:
51  *
52  *          LUTf stands for LUT<float>
53  *          LUTi stands for LUT<int>
54  *          LUTu stands for LUT<unsigned int>
55  *          LUTd stands for LUT<double>
56  *          LUTuc stands for LUT<unsigned char>
57  */
58 
59 #pragma once
60 
61 #include <cstring>
62 #include <cstdint>
63 #include <cassert>
64 
65 #ifndef NDEBUG
66 #include <fstream>
67 #endif
68 
69 #include "opthelper.h"
70 #include "rt_math.h"
71 
72 // Bit representations of flags
73 enum {
74     LUT_CLIP_OFF,   // LUT does not clip input values
75     LUT_CLIP_BELOW, // LUT clips input values at lower bound
76     LUT_CLIP_ABOVE  // LUT clips input values at upper bound
77 };
78 
79 template<typename T>
80 class LUT;
81 
82 using LUTf = LUT<float>;
83 using LUTi = LUT<int32_t>;
84 using LUTu = LUT<uint32_t>;
85 using LUTd = LUT<double>;
86 using LUTuc = LUT<uint8_t>;
87 
88 template<typename T>
89 class LUT
90 {
91 protected:
92     // list of variables ordered to improve cache speed
93     int maxs;
94     float maxsf;
95     T * data;
96     unsigned int clip;
97     unsigned int size;
98     unsigned int upperBound;  // always equals size-1, parameter created for performance reason
99 private:
100     unsigned int owner;
101 #ifdef __SSE2__
102     alignas(16) vfloat maxsv;
103     alignas(16) vfloat sizev;
104     alignas(16) vint sizeiv;
105 #endif
106 public:
107     /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed...
108     /// The user have to handle it itself, even if some method can (re)initialize it
109     bool dirty;
110 
111     LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
112     {
113 #ifndef NDEBUG
114 
115         if (s <= 0) {
116             printf("s<=0!\n");
117         }
118 
119         assert (s > 0);
120 #endif
121         dirty = true;
122         clip = flags;
123         // Add a few extra elements so [](vfloat) won't access out-of-bounds memory.
124         // The routine would still produce the right answer, but might cause issues
125         // with address/heap checking programs.
126         data = new T[s + 3];
127         owner = 1;
128         size = s;
129         upperBound = size - 1;
130         maxs = size - 2;
131         maxsf = (float)maxs;
132 #ifdef __SSE2__
133         maxsv =  F2V( maxs );
134         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
135         sizev = F2V( size - 1 );
136 #endif
137         if (initZero) {
138             clear();
139         }
140     }
operator()141     void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false)
142     {
143 #ifndef NDEBUG
144 
145         if (s <= 0) {
146             printf("s<=0!\n");
147         }
148 
149         assert (s > 0);
150 #endif
151 
152         if (owner && data) {
153             delete[] data;
154         }
155 
156         dirty = true; // Assumption!
157         clip = flags;
158         // See comment in constructor.
159         data = new T[s + 3];
160         owner = 1;
161         size = s;
162         upperBound = size - 1;
163         maxs = size - 2;
164         maxsf = (float)maxs;
165 #ifdef __SSE2__
166         maxsv =  F2V( maxs );
167         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
168         sizev = F2V( size - 1 );
169 #endif
170         if (initZero) {
171             clear();
172         }
173 
174     }
175 
LUT()176     LUT()
177     {
178         data = nullptr;
179         reset();
180 #ifdef __SSE2__
181         maxsv = ZEROV;
182         sizev = ZEROV;
183         sizeiv = _mm_setzero_si128();
184 #endif
185     }
186 
~LUT()187     ~LUT()
188     {
189         if (owner) {
190             delete[] data;
191 #ifndef NDEBUG
192             data = (T*)0xBAADF00D;
193 #endif
194         }
195     }
196 
197     explicit LUT(const LUT&) = delete;
198 
setClip(int flags)199     void setClip(int flags)
200     {
201         clip = flags;
202     }
203 
getClip()204     int getClip() const {
205         return clip;
206     }
207 
208     /** @brief Get the number of element in the LUT (i.e. dimension of the array)
209      *  For a LUT(500), it will return 500
210      *  @return number of element in the array
211      */
getSize()212     unsigned int getSize() const
213     {
214         return size;
215     }
216 
217     /** @brief Get the highest value possible (i.e. dimension of the array)
218      *  For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499
219      *  @return number of element in the array
220      */
getUpperBound()221     unsigned int getUpperBound() const
222     {
223         return size > 0 ? upperBound : 0;
224     }
225 
226     LUT<T> & operator=(const LUT<T>& rhs)
227     {
228         if (this != &rhs) {
229             if (rhs.size > this->size) {
230                 delete [] this->data;
231                 this->data = nullptr;
232             }
233 
234             if (this->data == nullptr) {
235                 // See comment in constructor.
236                 this->data = new T[rhs.size + 3];
237             }
238 
239             this->clip = rhs.clip;
240             this->owner = 1;
241             memcpy(this->data, rhs.data, rhs.size * sizeof(T));
242             this->size = rhs.size;
243             this->upperBound = rhs.upperBound;
244             this->maxs = this->size - 2;
245             this->maxsf = (float)this->maxs;
246 #ifdef __SSE2__
247             this->maxsv =  F2V( this->size - 2);
248             this->sizeiv =  _mm_set1_epi32( (int)(this->size - 1) );
249             this->sizev = F2V( this->size - 1 );
250 #endif
251         }
252 
253         return *this;
254     }
255 
256     // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t).
257     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
258     LUT<T> & operator+=(const LUT<T>& rhs)
259     {
260         if (rhs.size == this->size) {
261 #ifdef _OPENMP
262             #pragma omp simd
263 #endif
264 
265             for(unsigned int i = 0; i < this->size; i++) {
266                 data[i] += rhs.data[i];
267             }
268         }
269 
270         return *this;
271     }
272 
273     // multiply all elements of LUT<float> with a constant float value
274     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
275     LUT<float> & operator*=(float factor)
276     {
277 #ifdef _OPENMP
278         #pragma omp simd
279 #endif
280 
281         for(unsigned int i = 0; i < this->size; i++) {
282             data[i] *= factor;
283         }
284 
285         return *this;
286     }
287 
288     // divide all elements of LUT<float> by a constant float value
289     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
290     LUT<float> & operator/=(float divisor)
291     {
292 #ifdef _OPENMP
293         #pragma omp simd
294 #endif
295 
296         for(unsigned int i = 0; i < this->size; i++) {
297             data[i] /= divisor;
298         }
299 
300         return *this;
301     }
302 
303 
304     // use with integer indices
305     T& operator[](int index) const
306     {
307         return data[ rtengine::LIM<int>(index, 0, upperBound) ];
308     }
309 
310 #ifdef __SSE2__
311 
312 
313     // NOTE: This function requires LUTs which clips only at lower bound
cb(vfloat indexv)314     vfloat cb(vfloat indexv) const
315     {
316         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
317 
318         // Clamp and convert to integer values. Extract out of SSE register because all
319         // lookup operations use regular addresses.
320         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
321         vint indexes = _mm_cvttps_epi32(clampedIndexes);
322         int indexArray[4];
323         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
324 
325         // Load data from the table. This reads more than necessary, but there don't seem
326         // to exist more granular operations (though we could try non-SSE).
327         // Cast to int for convenience in the next operation (partial transpose).
328         vint values[4];
329         for (int i = 0; i < 4; ++i) {
330             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
331         }
332 
333         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
334         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
335         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
336         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
337         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
338         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
339 
340         vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes);
341         return vintpf(diff, upper, lower);
342     }
343 
344     // NOTE: This version requires LUTs which clip at upper and lower bounds
345     // (which is the default).
346     vfloat operator[](vfloat indexv) const
347     {
348         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
349 
350         // Clamp and convert to integer values. Extract out of SSE register because all
351         // lookup operations use regular addresses.
352         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
353         vint indexes = _mm_cvttps_epi32(clampedIndexes);
354         int indexArray[4];
355         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
356 
357         // Load data from the table. This reads more than necessary, but there don't seem
358         // to exist more granular operations (though we could try non-SSE).
359         // Cast to int for convenience in the next operation (partial transpose).
360         vint values[4];
361         for (int i = 0; i < 4; ++i) {
362             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
363         }
364 
365         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
366         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
367         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
368         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
369         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
370         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
371 
372         vfloat diff = vclampf(indexv, ZEROV, sizev) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN
373         return vintpf(diff, upper, lower);
374     }
375 
376     // NOTE: This version requires LUTs which do not clip at upper and lower bounds
operator()377     vfloat operator()(vfloat indexv) const
378     {
379         static_assert(std::is_same<T, float>::value, "This method only works for float LUTs");
380 
381         // Clamp and convert to integer values. Extract out of SSE register because all
382         // lookup operations use regular addresses.
383         vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN
384         vint indexes = _mm_cvttps_epi32(clampedIndexes);
385         int indexArray[4];
386         _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes);
387 
388         // Load data from the table. This reads more than necessary, but there don't seem
389         // to exist more granular operations (though we could try non-SSE).
390         // Cast to int for convenience in the next operation (partial transpose).
391         vint values[4];
392         for (int i = 0; i < 4; ++i) {
393             values[i] = _mm_castps_si128(LVFU(data[indexArray[i]]));
394         }
395 
396         // Partial 4x4 transpose operation. We want two new vectors, the first consisting
397         // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]].
398         __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]);
399         __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]);
400         vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1));
401         vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1));
402 
403         vfloat diff = indexv - _mm_cvtepi32_ps(indexes);
404         return vintpf(diff, upper, lower);
405     }
406 
407     // vectorized LUT access with integer indices. Clips at lower and upper bounds
408 #ifdef __SSE4_1__
409     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
410     vfloat operator[](vint idxv) const
411     {
412         idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv));
413         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
414         return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]);
415     }
416 #else
417     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
418     vfloat operator[](vint idxv) const
419     {
420         // convert to float because SSE2 has no min/max for 32bit integers
421         vfloat tempv = vclampf(_mm_cvtepi32_ps(idxv), ZEROV, sizev); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int)
422         idxv = _mm_cvttps_epi32(tempv);
423         // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code
424         return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)],
425                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))],
426                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))],
427                            data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]);
428     }
429 #endif
430 #endif
431 
432     // use with float indices
433     template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type>
434     T operator[](V index) const
435     {
436         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
437 
438         if (index < 0.f) {
439             if (clip & LUT_CLIP_BELOW) {
440                 return data[0];
441             }
442 
443             idx = 0;
444         } else if (index > maxsf) {
445             if (clip & LUT_CLIP_ABOVE) {
446                 return data[upperBound];
447             }
448 
449             idx = maxs;
450         }
451 
452         float diff = index - (float) idx;
453         T p1 = data[idx];
454         T p2 = data[idx + 1] - p1;
455         return (p1 + p2 * diff);
456     }
457 
458     // Return the value for "index" that is in the [0-1] range.
459     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
getVal01(float index)460     T getVal01 (float index) const
461     {
462         index *= (float)upperBound;
463         int idx = (int)index;  // don't use floor! The difference in negative space is no problems here
464 
465         if (index < 0.f) {
466             if (clip & LUT_CLIP_BELOW) {
467                 return data[0];
468             }
469 
470             idx = 0;
471         } else if (index > maxsf) {
472             if (clip & LUT_CLIP_ABOVE) {
473                 return data[upperBound];
474             }
475 
476             idx = maxs;
477         }
478 
479         float diff = index - (float) idx;
480         T p1 = data[idx];
481         T p2 = data[idx + 1] - p1;
482         return (p1 + p2 * diff);
483     }
484 
485     operator bool (void) const
486     {
487         return size > 0;
488     }
489 
clear(void)490     void clear(void)
491     {
492         if (data && size) {
493             memset(data, 0, size * sizeof(T));
494         }
495     }
496 
reset(void)497     void reset(void)
498     {
499         if (data) {
500             delete[] data;
501         }
502 
503         dirty = true;
504         data = nullptr;
505         owner = 1;
506         size = 0;
507         upperBound = 0;
508         maxs = 0;
509         maxsf = 0.f;
510         clip = 0;
511     }
512 
513     // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor)
514     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
515     void makeIdentity(float divisor = 1.f)
516     {
517         if(divisor == 1.f) {
518             for(unsigned int i = 0; i < size; i++) {
519                 data[i] = i;
520             }
521         } else {
522             for(unsigned int i = 0; i < size; i++) {
523                 data[i] = i / divisor;
524             }
525         }
526     }
527 
528     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x)
529     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
530     void compressTo(LUT<T> &dest, unsigned int numVals = 0) const
531     {
532         numVals = numVals == 0 ? size : numVals;
533         numVals = std::min(numVals, size);
534         float divisor = numVals - 1;
535         float mult = (dest.size - 1) / divisor;
536 
537         for (unsigned int i = 0; i < numVals; i++) {
538             int hi = (int)(mult * i);
539             dest.data[hi] += this->data[i] ;
540         }
541     }
542 
543     // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passThrough LUT to calculate indexes
544     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)545     void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const
546     {
547         if(passThrough) {
548             numVals = std::min(numVals, size);
549             numVals = std::min(numVals, passThrough.getSize());
550             float mult = dest.size - 1;
551 
552             for (unsigned int i = 0; i < numVals; i++) {
553                 int hi = (int)(mult * passThrough[i]);
554                 dest[hi] += this->data[i] ;
555             }
556         }
557     }
558 
559     // compute sum and average of a LUT<uint32_t>
560     template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type>
getSumAndAverage(float & sum,float & avg)561     void getSumAndAverage(float &sum, float &avg) const
562     {
563         sum = 0.f;
564         avg = 0.f;
565         int i = 0;
566 #ifdef __SSE2__
567         vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
568         vfloat fourv = F2V(4.f);
569         vint sumv = (vint)ZEROV;
570         vfloat avgv = ZEROV;
571 
572         for(; i < static_cast<int>(size) - 3; i += 4) {
573             vint datav = _mm_loadu_si128((__m128i*)&data[i]);
574             sumv += datav;
575             avgv += iv * _mm_cvtepi32_ps(datav);
576             iv += fourv;
577 
578         }
579 
580         sum = vhadd(_mm_cvtepi32_ps(sumv));
581         avg = vhadd(avgv);
582 #endif
583 
584         for (; i < static_cast<int>(size); i++) {
585             T val = data[i];
586             sum += val;
587             avg += i * val;
588         }
589 
590         avg /= sum;
591     }
592 
593 
594     template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type>
595     void makeConstant(float value, unsigned int numVals = 0)
596     {
597         numVals = numVals == 0 ? size : numVals;
598         numVals = std::min(numVals, size);
599 
600         for(unsigned int i = 0; i < numVals; i++) {
601             data[i] = value;
602         }
603     }
604 
605     // share the buffer with another LUT, handy for same data but different clip flags
606     void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE)
607     {
608         if (owner && data) {
609             delete[] data;
610         }
611 
612         dirty = false;  // Assumption
613         clip = flags;
614         data = source.data;
615         owner = 0;
616         size = source.getSize();
617         upperBound = size - 1;
618         maxs = size - 2;
619         maxsf = (float)maxs;
620 #ifdef __SSE2__
621         maxsv =  F2V( size - 2);
622         sizeiv =  _mm_set1_epi32( (int)(size - 1) );
623         sizev = F2V( size - 1 );
624 #endif
625     }
626 
627 
628 };
629