1 /* 2 * LUT.h 3 * This file is part of RawTherapee. 4 * 5 * Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com) 6 * 7 * RawTherapee is free software: you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation, either version 3 of the License, or 10 * (at your option) any later version. 11 * 12 * RawTherapee is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with RawTherapee. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 /* 22 * Declaration of flexible Lookup Tables 23 * 24 * Usage: 25 * 26 * LUT<type> name (size); 27 * LUT<type> name (size, flags); 28 * 29 * creates an array which is valid within the normal C/C++ scope "{ ... }" 30 * 31 * access to elements is a simple as: 32 * 33 * LUT<float> my_lut (10); 34 * float value = my_lut[3]; 35 * float value = my_lut[2.5]; // this will interpolate 36 * 37 * when using a float type index it will interpolate the lookup values 38 * 39 * extra setting in flags: (clipping is set by default) 40 * LUT_CLIP_ABOVE 41 * LUT_CLIP_BELOW 42 * 43 * example: 44 * LUT<float> my_lut (10,LUT_CLIP_BELOW); 45 * float value = my_lut[22.5]; // this will extrapolate 46 * float value = my_lut[-22.5]; // this will not extrapolate 47 * 48 * LUT<float> my_lut (10,0); // this will extrapolate on either side 49 * 50 * shotcuts: 51 * 52 * LUTf stands for LUT<float> 53 * LUTi stands for LUT<int> 54 * LUTu stands for LUT<unsigned int> 55 * LUTd stands for LUT<double> 56 * LUTuc stands for LUT<unsigned char> 57 */ 58 59 #ifndef LUT_H_ 60 #define LUT_H_ 61 62 #include <cstring> 63 #include <cstdint> 64 #ifndef NDEBUG 65 #include <cassert> 66 #endif 67 #include "opthelper.h" 68 #include "rt_math.h" 69 70 // Bit representations of flags 71 enum { 72 LUT_CLIP_OFF, // LUT does not clip input values 73 LUT_CLIP_BELOW, // LUT clips input values at lower bound 74 LUT_CLIP_ABOVE // LUT clips input values at upper bound 75 }; 76 77 template<typename T> 78 class LUT; 79 80 using LUTf = LUT<float>; 81 using LUTi = LUT<int32_t>; 82 using LUTu = LUT<uint32_t>; 83 using LUTd = LUT<double>; 84 using LUTuc = LUT<uint8_t>; 85 86 template<typename T> 87 class LUT 88 { 89 protected: 90 // list of variables ordered to improve cache speed 91 int maxs; 92 float maxsf; 93 T * data; 94 unsigned int clip; 95 unsigned int size; 96 unsigned int upperBound; // always equals size-1, parameter created for performance reason 97 private: 98 unsigned int owner; 99 #ifdef __SSE2__ 100 alignas(16) vfloat maxsv; 101 alignas(16) vfloat sizev; 102 alignas(16) vint sizeiv; 103 #endif 104 public: 105 /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed... 106 /// The user have to handle it itself, even if some method can (re)initialize it 107 bool dirty; 108 109 LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 110 { 111 dirty = true; 112 clip = flags; 113 // Add a few extra elements so [](vfloat) won't access out-of-bounds memory. 114 // The routine would still produce the right answer, but might cause issues 115 // with address/heap checking programs. 116 data = new T[s + 3]; 117 owner = 1; 118 size = s; 119 upperBound = size - 1; 120 maxs = size - 2; 121 maxsf = (float)maxs; 122 #ifdef __SSE2__ 123 maxsv = F2V( maxs ); 124 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 125 sizev = F2V( size - 1 ); 126 #endif 127 if (initZero) { 128 clear(); 129 } 130 } operator()131 void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 132 { 133 if (owner && data) { 134 delete[] data; 135 } 136 137 dirty = true; // Assumption! 138 clip = flags; 139 // See comment in constructor. 140 data = new T[s + 3]; 141 owner = 1; 142 size = s; 143 upperBound = size - 1; 144 maxs = size - 2; 145 maxsf = (float)maxs; 146 #ifdef __SSE2__ 147 maxsv = F2V( maxs ); 148 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 149 sizev = F2V( size - 1 ); 150 #endif 151 if (initZero) { 152 clear(); 153 } 154 155 } 156 LUT()157 LUT() 158 { 159 data = nullptr; 160 reset(); 161 #ifdef __SSE2__ 162 maxsv = ZEROV; 163 sizev = ZEROV; 164 sizeiv = _mm_setzero_si128(); 165 #endif 166 } 167 ~LUT()168 ~LUT() 169 { 170 if (owner) { 171 delete[] data; 172 #ifndef NDEBUG 173 data = (T*)0xBAADF00D; 174 #endif 175 } 176 } 177 setClip(int flags)178 void setClip(int flags) 179 { 180 clip = flags; 181 } 182 getClip()183 int getClip() const { 184 return clip; 185 } 186 187 /** @brief Get the number of element in the LUT (i.e. dimension of the array) 188 * For a LUT(500), it will return 500 189 * @return number of element in the array 190 */ getSize()191 unsigned int getSize() const 192 { 193 return size; 194 } 195 196 /** @brief Get the highest value possible (i.e. dimension of the array) 197 * For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499 198 * @return number of element in the array 199 */ getUpperBound()200 unsigned int getUpperBound() const 201 { 202 return size > 0 ? upperBound : 0; 203 } 204 205 LUT<T> & operator=(LUT<T> &rhs) 206 { 207 if (this != &rhs) { 208 if (rhs.size > this->size) { 209 delete [] this->data; 210 this->data = nullptr; 211 } 212 213 if (this->data == nullptr) { 214 // See comment in constructor. 215 this->data = new T[rhs.size + 3]; 216 } 217 218 this->clip = rhs.clip; 219 this->owner = 1; 220 memcpy(this->data, rhs.data, rhs.size * sizeof(T)); 221 this->size = rhs.size; 222 this->upperBound = rhs.upperBound; 223 this->maxs = this->size - 2; 224 this->maxsf = (float)this->maxs; 225 #ifdef __SSE2__ 226 this->maxsv = F2V( this->size - 2); 227 this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); 228 this->sizev = F2V( this->size - 1 ); 229 #endif 230 } 231 232 return *this; 233 } 234 235 // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t). 236 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 237 LUT<T> & operator+=(LUT<T> &rhs) 238 { 239 if (rhs.size == this->size) { 240 #ifdef _OPENMP 241 #pragma omp simd 242 #endif 243 244 for(unsigned int i = 0; i < this->size; i++) { 245 data[i] += rhs.data[i]; 246 } 247 } 248 249 return *this; 250 } 251 252 // multiply all elements of LUT<float> with a constant float value 253 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 254 LUT<float> & operator*=(float factor) 255 { 256 #ifdef _OPENMP 257 #pragma omp simd 258 #endif 259 260 for(unsigned int i = 0; i < this->size; i++) { 261 data[i] *= factor; 262 } 263 264 return *this; 265 } 266 267 // divide all elements of LUT<float> by a constant float value 268 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 269 LUT<float> & operator/=(float divisor) 270 { 271 #ifdef _OPENMP 272 #pragma omp simd 273 #endif 274 275 for(unsigned int i = 0; i < this->size; i++) { 276 data[i] /= divisor; 277 } 278 279 return *this; 280 } 281 282 283 // use with integer indices 284 T& operator[](int index) const 285 { 286 return data[ librtprocess::LIM<int>(index, 0, upperBound) ]; 287 } 288 289 #ifdef __SSE2__ 290 291 292 // NOTE: This function requires LUTs which clips only at lower bound cb(vfloat indexv)293 vfloat cb(vfloat indexv) const 294 { 295 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 296 297 // Clamp and convert to integer values. Extract out of SSE register because all 298 // lookup operations use regular addresses. 299 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 300 vint indexes = _mm_cvttps_epi32(clampedIndexes); 301 int indexArray[4]; 302 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 303 304 // Load data from the table. This reads more than necessary, but there don't seem 305 // to exist more granular operations (though we could try non-SSE). 306 // Cast to int for convenience in the next operation (partial transpose). 307 vint values[4]; 308 for (int i = 0; i < 4; ++i) { 309 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 310 } 311 312 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 313 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 314 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 315 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 316 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 317 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 318 319 vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes); 320 return vintpf(diff, upper, lower); 321 } 322 323 // NOTE: This version requires LUTs which clip at upper and lower bounds 324 // (which is the default). 325 vfloat operator[](vfloat indexv) const 326 { 327 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 328 329 // Clamp and convert to integer values. Extract out of SSE register because all 330 // lookup operations use regular addresses. 331 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 332 vint indexes = _mm_cvttps_epi32(clampedIndexes); 333 int indexArray[4]; 334 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 335 336 // Load data from the table. This reads more than necessary, but there don't seem 337 // to exist more granular operations (though we could try non-SSE). 338 // Cast to int for convenience in the next operation (partial transpose). 339 vint values[4]; 340 for (int i = 0; i < 4; ++i) { 341 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 342 } 343 344 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 345 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 346 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 347 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 348 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 349 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 350 351 vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes); 352 return vintpf(diff, upper, lower); 353 } 354 355 // NOTE: This version requires LUTs which do not clip at upper and lower bounds operator()356 vfloat operator()(vfloat indexv) const 357 { 358 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 359 360 // Clamp and convert to integer values. Extract out of SSE register because all 361 // lookup operations use regular addresses. 362 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 363 vint indexes = _mm_cvttps_epi32(clampedIndexes); 364 int indexArray[4]; 365 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 366 367 // Load data from the table. This reads more than necessary, but there don't seem 368 // to exist more granular operations (though we could try non-SSE). 369 // Cast to int for convenience in the next operation (partial transpose). 370 vint values[4]; 371 for (int i = 0; i < 4; ++i) { 372 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 373 } 374 375 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 376 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 377 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 378 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 379 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 380 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 381 382 vfloat diff = indexv - _mm_cvtepi32_ps(indexes); 383 return vintpf(diff, upper, lower); 384 } 385 386 // vectorized LUT access with integer indices. Clips at lower and upper bounds 387 #ifdef __SSE4_1__ 388 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 389 vfloat operator[](vint idxv) const 390 { 391 idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); 392 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 393 return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]); 394 } 395 #else 396 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 397 vfloat operator[](vint idxv) const 398 { 399 vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers 400 idxv = _mm_cvttps_epi32(tempv); 401 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 402 return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)], 403 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))], 404 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))], 405 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]); 406 } 407 #endif 408 #endif 409 410 // use with float indices 411 template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type> 412 T operator[](V index) const 413 { 414 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 415 416 if (index < 0.f) { 417 if (clip & LUT_CLIP_BELOW) { 418 return data[0]; 419 } 420 421 idx = 0; 422 } else if (idx > maxs) { 423 if (clip & LUT_CLIP_ABOVE) { 424 return data[upperBound]; 425 } 426 427 idx = maxs; 428 } 429 430 float diff = index - (float) idx; 431 T p1 = data[idx]; 432 T p2 = data[idx + 1] - p1; 433 return (p1 + p2 * diff); 434 } 435 436 // Return the value for "index" that is in the [0-1] range. 437 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> getVal01(float index)438 T getVal01 (float index) const 439 { 440 index *= (float)upperBound; 441 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 442 443 if (index < 0.f) { 444 if (clip & LUT_CLIP_BELOW) { 445 return data[0]; 446 } 447 448 idx = 0; 449 } else if (index > maxsf) { 450 if (clip & LUT_CLIP_ABOVE) { 451 return data[upperBound]; 452 } 453 454 idx = maxs; 455 } 456 457 float diff = index - (float) idx; 458 T p1 = data[idx]; 459 T p2 = data[idx + 1] - p1; 460 return (p1 + p2 * diff); 461 } 462 463 operator bool (void) const 464 { 465 return size > 0; 466 } 467 clear(void)468 void clear(void) 469 { 470 if (data && size) { 471 memset(data, 0, size * sizeof(T)); 472 } 473 } 474 reset(void)475 void reset(void) 476 { 477 if (data) { 478 delete[] data; 479 } 480 481 dirty = true; 482 data = nullptr; 483 owner = 1; 484 size = 0; 485 upperBound = 0; 486 maxs = 0; 487 maxsf = 0.f; 488 clip = 0; 489 } 490 491 // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor) 492 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 493 void makeIdentity(float divisor = 1.f) 494 { 495 if(divisor == 1.f) { 496 for(unsigned int i = 0; i < size; i++) { 497 data[i] = i; 498 } 499 } else { 500 for(unsigned int i = 0; i < size; i++) { 501 data[i] = i / divisor; 502 } 503 } 504 } 505 506 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) 507 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 508 void compressTo(LUT<T> &dest, unsigned int numVals = 0) const 509 { 510 numVals = numVals == 0 ? size : numVals; 511 numVals = std::min(numVals, size); 512 float divisor = numVals - 1; 513 float mult = (dest.size - 1) / divisor; 514 515 for (unsigned int i = 0; i < numVals; i++) { 516 int hi = (int)(mult * i); 517 dest.data[hi] += this->data[i] ; 518 } 519 } 520 521 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes 522 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)523 void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const 524 { 525 if(passThrough) { 526 numVals = std::min(numVals, size); 527 numVals = std::min(numVals, passThrough.getSize()); 528 float mult = dest.size - 1; 529 530 for (unsigned int i = 0; i < numVals; i++) { 531 int hi = (int)(mult * passThrough[i]); 532 dest[hi] += this->data[i] ; 533 } 534 } 535 } 536 537 // compute sum and average of a LUT<uint32_t> 538 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> getSumAndAverage(float & sum,float & avg)539 void getSumAndAverage(float &sum, float &avg) const 540 { 541 sum = 0.f; 542 avg = 0.f; 543 int i = 0; 544 #ifdef __SSE2__ 545 vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); 546 vfloat fourv = F2V(4.f); 547 vint sumv = (vint)ZEROV; 548 vfloat avgv = ZEROV; 549 550 for(; i < static_cast<int>(size) - 3; i += 4) { 551 vint datav = _mm_loadu_si128((__m128i*)&data[i]); 552 sumv += datav; 553 avgv += iv * _mm_cvtepi32_ps(datav); 554 iv += fourv; 555 556 } 557 558 sum = vhadd(_mm_cvtepi32_ps(sumv)); 559 avg = vhadd(avgv); 560 #endif 561 562 for (; i < static_cast<int>(size); i++) { 563 T val = data[i]; 564 sum += val; 565 avg += i * val; 566 } 567 568 avg /= sum; 569 } 570 571 572 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 573 void makeConstant(float value, unsigned int numVals = 0) 574 { 575 numVals = numVals == 0 ? size : numVals; 576 numVals = std::min(numVals, size); 577 578 for(unsigned int i = 0; i < numVals; i++) { 579 data[i] = value; 580 } 581 } 582 583 // share the buffer with another LUT, handy for same data but different clip flags 584 void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE) 585 { 586 if (owner && data) { 587 delete[] data; 588 } 589 590 dirty = false; // Assumption 591 clip = flags; 592 data = source.data; 593 owner = 0; 594 size = source.getSize(); 595 upperBound = size - 1; 596 maxs = size - 2; 597 maxsf = (float)maxs; 598 #ifdef __SSE2__ 599 maxsv = F2V( size - 2); 600 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 601 sizev = F2V( size - 1 ); 602 #endif 603 } 604 605 606 }; 607 608 #endif /* LUT_H_ */ 609