1 /* 2 * LUT.h 3 * This file is part of RawTherapee. 4 * 5 * Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com) 6 * 7 * RawTherapee is free software: you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation, either version 3 of the License, or 10 * (at your option) any later version. 11 * 12 * RawTherapee is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with RawTherapee. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 /* 22 * Declaration of flexible Lookup Tables 23 * 24 * Usage: 25 * 26 * LUT<type> name (size); 27 * LUT<type> name (size, flags); 28 * 29 * creates an array which is valid within the normal C/C++ scope "{ ... }" 30 * 31 * access to elements is a simple as: 32 * 33 * LUT<float> my_lut (10); 34 * float value = my_lut[3]; 35 * float value = my_lut[2.5]; // this will interpolate 36 * 37 * when using a float type index it will interpolate the lookup values 38 * 39 * extra setting in flags: (clipping is set by default) 40 * LUT_CLIP_ABOVE 41 * LUT_CLIP_BELOW 42 * 43 * example: 44 * LUT<float> my_lut (10,LUT_CLIP_BELOW); 45 * float value = my_lut[22.5]; // this will extrapolate 46 * float value = my_lut[-22.5]; // this will not extrapolate 47 * 48 * LUT<float> my_lut (10,0); // this will extrapolate on either side 49 * 50 * shotcuts: 51 * 52 * LUTf stands for LUT<float> 53 * LUTi stands for LUT<int> 54 * LUTu stands for LUT<unsigned int> 55 * LUTd stands for LUT<double> 56 * LUTuc stands for LUT<unsigned char> 57 */ 58 59 #ifndef LUT_H_ 60 #define LUT_H_ 61 62 #include <cstring> 63 #include <cstdint> 64 #include <cassert> 65 66 #ifndef NDEBUG 67 #include <glibmm.h> 68 #include <fstream> 69 #endif 70 71 #include "opthelper.h" 72 #include "rt_math.h" 73 #include "noncopyable.h" 74 75 // Bit representations of flags 76 enum { 77 LUT_CLIP_OFF, // LUT does not clip input values 78 LUT_CLIP_BELOW, // LUT clips input values at lower bound 79 LUT_CLIP_ABOVE // LUT clips input values at upper bound 80 }; 81 82 template<typename T> 83 class LUT; 84 85 using LUTf = LUT<float>; 86 using LUTi = LUT<int32_t>; 87 using LUTu = LUT<uint32_t>; 88 using LUTd = LUT<double>; 89 using LUTuc = LUT<uint8_t>; 90 91 template<typename T> 92 class LUT 93 { 94 protected: 95 // list of variables ordered to improve cache speed 96 int maxs; 97 float maxsf; 98 T * data; 99 unsigned int clip; 100 unsigned int size; 101 unsigned int upperBound; // always equals size-1, parameter created for performance reason 102 private: 103 unsigned int owner; 104 #ifdef __SSE2__ 105 alignas(16) vfloat maxsv; 106 alignas(16) vfloat sizev; 107 alignas(16) vint sizeiv; 108 #endif 109 public: 110 /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed... 111 /// The user have to handle it itself, even if some method can (re)initialize it 112 bool dirty; 113 114 LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 115 { 116 #ifndef NDEBUG 117 118 if (s <= 0) { 119 printf("s<=0!\n"); 120 } 121 122 assert (s > 0); 123 #endif 124 dirty = true; 125 clip = flags; 126 // Add a few extra elements so [](vfloat) won't access out-of-bounds memory. 127 // The routine would still produce the right answer, but might cause issues 128 // with address/heap checking programs. 129 data = new T[s + 3]; 130 owner = 1; 131 size = s; 132 upperBound = size - 1; 133 maxs = size - 2; 134 maxsf = (float)maxs; 135 #ifdef __SSE2__ 136 maxsv = F2V( maxs ); 137 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 138 sizev = F2V( size - 1 ); 139 #endif 140 if (initZero) { 141 clear(); 142 } 143 } operator()144 void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 145 { 146 #ifndef NDEBUG 147 148 if (s <= 0) { 149 printf("s<=0!\n"); 150 } 151 152 assert (s > 0); 153 #endif 154 155 if (owner && data) { 156 delete[] data; 157 } 158 159 dirty = true; // Assumption! 160 clip = flags; 161 // See comment in constructor. 162 data = new T[s + 3]; 163 owner = 1; 164 size = s; 165 upperBound = size - 1; 166 maxs = size - 2; 167 maxsf = (float)maxs; 168 #ifdef __SSE2__ 169 maxsv = F2V( maxs ); 170 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 171 sizev = F2V( size - 1 ); 172 #endif 173 if (initZero) { 174 clear(); 175 } 176 177 } 178 LUT()179 LUT() 180 { 181 data = nullptr; 182 reset(); 183 #ifdef __SSE2__ 184 maxsv = ZEROV; 185 sizev = ZEROV; 186 sizeiv = _mm_setzero_si128(); 187 #endif 188 } 189 ~LUT()190 ~LUT() 191 { 192 if (owner) { 193 delete[] data; 194 #ifndef NDEBUG 195 data = (T*)0xBAADF00D; 196 #endif 197 } 198 } 199 200 explicit LUT(const LUT&) = delete; 201 setClip(int flags)202 void setClip(int flags) 203 { 204 clip = flags; 205 } 206 getClip()207 int getClip() const { 208 return clip; 209 } 210 211 /** @brief Get the number of element in the LUT (i.e. dimension of the array) 212 * For a LUT(500), it will return 500 213 * @return number of element in the array 214 */ getSize()215 unsigned int getSize() const 216 { 217 return size; 218 } 219 220 /** @brief Get the highest value possible (i.e. dimension of the array) 221 * For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499 222 * @return number of element in the array 223 */ getUpperBound()224 unsigned int getUpperBound() const 225 { 226 return size > 0 ? upperBound : 0; 227 } 228 229 LUT<T> & operator=(const LUT<T>& rhs) 230 { 231 if (this != &rhs) { 232 if (rhs.size > this->size) { 233 delete [] this->data; 234 this->data = nullptr; 235 } 236 237 if (this->data == nullptr) { 238 // See comment in constructor. 239 this->data = new T[rhs.size + 3]; 240 } 241 242 this->clip = rhs.clip; 243 this->owner = 1; 244 memcpy(this->data, rhs.data, rhs.size * sizeof(T)); 245 this->size = rhs.size; 246 this->upperBound = rhs.upperBound; 247 this->maxs = this->size - 2; 248 this->maxsf = (float)this->maxs; 249 #ifdef __SSE2__ 250 this->maxsv = F2V( this->size - 2); 251 this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); 252 this->sizev = F2V( this->size - 1 ); 253 #endif 254 } 255 256 return *this; 257 } 258 259 // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t). 260 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 261 LUT<T> & operator+=(const LUT<T>& rhs) 262 { 263 if (rhs.size == this->size) { 264 #ifdef _OPENMP 265 #pragma omp simd 266 #endif 267 268 for(unsigned int i = 0; i < this->size; i++) { 269 data[i] += rhs.data[i]; 270 } 271 } 272 273 return *this; 274 } 275 276 // multiply all elements of LUT<float> with a constant float value 277 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 278 LUT<float> & operator*=(float factor) 279 { 280 #ifdef _OPENMP 281 #pragma omp simd 282 #endif 283 284 for(unsigned int i = 0; i < this->size; i++) { 285 data[i] *= factor; 286 } 287 288 return *this; 289 } 290 291 // divide all elements of LUT<float> by a constant float value 292 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 293 LUT<float> & operator/=(float divisor) 294 { 295 #ifdef _OPENMP 296 #pragma omp simd 297 #endif 298 299 for(unsigned int i = 0; i < this->size; i++) { 300 data[i] /= divisor; 301 } 302 303 return *this; 304 } 305 306 307 // use with integer indices 308 T& operator[](int index) const 309 { 310 return data[ rtengine::LIM<int>(index, 0, upperBound) ]; 311 } 312 313 #ifdef __SSE2__ 314 315 316 // NOTE: This function requires LUTs which clips only at lower bound cb(vfloat indexv)317 vfloat cb(vfloat indexv) const 318 { 319 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 320 321 // Clamp and convert to integer values. Extract out of SSE register because all 322 // lookup operations use regular addresses. 323 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 324 vint indexes = _mm_cvttps_epi32(clampedIndexes); 325 int indexArray[4]; 326 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 327 328 // Load data from the table. This reads more than necessary, but there don't seem 329 // to exist more granular operations (though we could try non-SSE). 330 // Cast to int for convenience in the next operation (partial transpose). 331 vint values[4]; 332 for (int i = 0; i < 4; ++i) { 333 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 334 } 335 336 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 337 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 338 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 339 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 340 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 341 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 342 343 vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes); 344 return vintpf(diff, upper, lower); 345 } 346 347 // NOTE: This version requires LUTs which clip at upper and lower bounds 348 // (which is the default). 349 vfloat operator[](vfloat indexv) const 350 { 351 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 352 353 // Clamp and convert to integer values. Extract out of SSE register because all 354 // lookup operations use regular addresses. 355 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 356 vint indexes = _mm_cvttps_epi32(clampedIndexes); 357 int indexArray[4]; 358 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 359 360 // Load data from the table. This reads more than necessary, but there don't seem 361 // to exist more granular operations (though we could try non-SSE). 362 // Cast to int for convenience in the next operation (partial transpose). 363 vint values[4]; 364 for (int i = 0; i < 4; ++i) { 365 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 366 } 367 368 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 369 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 370 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 371 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 372 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 373 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 374 375 vfloat diff = vclampf(indexv, ZEROV, sizev) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN 376 return vintpf(diff, upper, lower); 377 } 378 379 // NOTE: This version requires LUTs which do not clip at upper and lower bounds operator()380 vfloat operator()(vfloat indexv) const 381 { 382 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 383 384 // Clamp and convert to integer values. Extract out of SSE register because all 385 // lookup operations use regular addresses. 386 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 387 vint indexes = _mm_cvttps_epi32(clampedIndexes); 388 int indexArray[4]; 389 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 390 391 // Load data from the table. This reads more than necessary, but there don't seem 392 // to exist more granular operations (though we could try non-SSE). 393 // Cast to int for convenience in the next operation (partial transpose). 394 vint values[4]; 395 for (int i = 0; i < 4; ++i) { 396 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 397 } 398 399 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 400 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 401 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 402 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 403 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 404 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 405 406 vfloat diff = indexv - _mm_cvtepi32_ps(indexes); 407 return vintpf(diff, upper, lower); 408 } 409 410 // vectorized LUT access with integer indices. Clips at lower and upper bounds 411 #ifdef __SSE4_1__ 412 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 413 vfloat operator[](vint idxv) const 414 { 415 idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); 416 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 417 return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]); 418 } 419 #else 420 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 421 vfloat operator[](vint idxv) const 422 { 423 // convert to float because SSE2 has no min/max for 32bit integers 424 vfloat tempv = vclampf(_mm_cvtepi32_ps(idxv), ZEROV, sizev); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int) 425 idxv = _mm_cvttps_epi32(tempv); 426 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 427 return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)], 428 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))], 429 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))], 430 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]); 431 } 432 #endif 433 #endif 434 435 // use with float indices 436 template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type> 437 T operator[](V index) const 438 { 439 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 440 441 if (index < 0.f) { 442 if (clip & LUT_CLIP_BELOW) { 443 return data[0]; 444 } 445 446 idx = 0; 447 } else if (index > maxsf) { 448 if (clip & LUT_CLIP_ABOVE) { 449 return data[upperBound]; 450 } 451 452 idx = maxs; 453 } 454 455 float diff = index - (float) idx; 456 T p1 = data[idx]; 457 T p2 = data[idx + 1] - p1; 458 return (p1 + p2 * diff); 459 } 460 461 // Return the value for "index" that is in the [0-1] range. 462 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> getVal01(float index)463 T getVal01 (float index) const 464 { 465 index *= (float)upperBound; 466 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 467 468 if (index < 0.f) { 469 if (clip & LUT_CLIP_BELOW) { 470 return data[0]; 471 } 472 473 idx = 0; 474 } else if (index > maxsf) { 475 if (clip & LUT_CLIP_ABOVE) { 476 return data[upperBound]; 477 } 478 479 idx = maxs; 480 } 481 482 float diff = index - (float) idx; 483 T p1 = data[idx]; 484 T p2 = data[idx + 1] - p1; 485 return (p1 + p2 * diff); 486 } 487 488 #ifndef NDEBUG 489 // Debug facility ; dump the content of the LUT in a file. No control of the filename is done dump(Glib::ustring fname)490 void dump(Glib::ustring fname) 491 { 492 if (size) { 493 Glib::ustring fname_ = fname + ".xyz"; // TopSolid'Design "plot" file format 494 std::ofstream f (fname_.c_str()); 495 f << "$" << std::endl; 496 497 for (unsigned int iter = 0; iter < size; iter++) { 498 f << iter << ", " << data[iter] << ", 0." << std::endl; 499 } 500 501 f << "$" << std::endl; 502 f.close (); 503 } 504 } 505 #endif 506 507 508 operator bool (void) const 509 { 510 return size > 0; 511 } 512 clear(void)513 void clear(void) 514 { 515 if (data && size) { 516 memset(data, 0, size * sizeof(T)); 517 } 518 } 519 reset(void)520 void reset(void) 521 { 522 if (data) { 523 delete[] data; 524 } 525 526 dirty = true; 527 data = nullptr; 528 owner = 1; 529 size = 0; 530 upperBound = 0; 531 maxs = 0; 532 maxsf = 0.f; 533 clip = 0; 534 } 535 536 // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor) 537 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 538 void makeIdentity(float divisor = 1.f) 539 { 540 if(divisor == 1.f) { 541 for(unsigned int i = 0; i < size; i++) { 542 data[i] = i; 543 } 544 } else { 545 for(unsigned int i = 0; i < size; i++) { 546 data[i] = i / divisor; 547 } 548 } 549 } 550 551 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) 552 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 553 void compressTo(LUT<T> &dest, unsigned int numVals = 0) const 554 { 555 numVals = numVals == 0 ? size : numVals; 556 numVals = std::min(numVals, size); 557 float divisor = numVals - 1; 558 float mult = (dest.size - 1) / divisor; 559 560 for (unsigned int i = 0; i < numVals; i++) { 561 int hi = (int)(mult * i); 562 dest.data[hi] += this->data[i] ; 563 } 564 } 565 566 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes 567 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)568 void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const 569 { 570 if(passThrough) { 571 numVals = std::min(numVals, size); 572 numVals = std::min(numVals, passThrough.getSize()); 573 float mult = dest.size - 1; 574 575 for (unsigned int i = 0; i < numVals; i++) { 576 int hi = (int)(mult * passThrough[i]); 577 dest[hi] += this->data[i] ; 578 } 579 } 580 } 581 582 // compute sum and average of a LUT<uint32_t> 583 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> getSumAndAverage(float & sum,float & avg)584 void getSumAndAverage(float &sum, float &avg) const 585 { 586 sum = 0.f; 587 avg = 0.f; 588 int i = 0; 589 #ifdef __SSE2__ 590 vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); 591 vfloat fourv = F2V(4.f); 592 vint sumv = (vint)ZEROV; 593 vfloat avgv = ZEROV; 594 595 for(; i < static_cast<int>(size) - 3; i += 4) { 596 vint datav = _mm_loadu_si128((__m128i*)&data[i]); 597 sumv += datav; 598 avgv += iv * _mm_cvtepi32_ps(datav); 599 iv += fourv; 600 601 } 602 603 sum = vhadd(_mm_cvtepi32_ps(sumv)); 604 avg = vhadd(avgv); 605 #endif 606 607 for (; i < static_cast<int>(size); i++) { 608 T val = data[i]; 609 sum += val; 610 avg += i * val; 611 } 612 613 avg /= sum; 614 } 615 616 617 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 618 void makeConstant(float value, unsigned int numVals = 0) 619 { 620 numVals = numVals == 0 ? size : numVals; 621 numVals = std::min(numVals, size); 622 623 for(unsigned int i = 0; i < numVals; i++) { 624 data[i] = value; 625 } 626 } 627 628 // share the buffer with another LUT, handy for same data but different clip flags 629 void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE) 630 { 631 if (owner && data) { 632 delete[] data; 633 } 634 635 dirty = false; // Assumption 636 clip = flags; 637 data = source.data; 638 owner = 0; 639 size = source.getSize(); 640 upperBound = size - 1; 641 maxs = size - 2; 642 maxsf = (float)maxs; 643 #ifdef __SSE2__ 644 maxsv = F2V( size - 2); 645 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 646 sizev = F2V( size - 1 ); 647 #endif 648 } 649 650 651 }; 652 653 #endif /* LUT_H_ */ 654