1 /* 2 * LUT.h 3 * This file is part of RawTherapee. 4 * 5 * Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com) 6 * 7 * RawTherapee is free software: you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation, either version 3 of the License, or 10 * (at your option) any later version. 11 * 12 * RawTherapee is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with RawTherapee. If not, see <https://www.gnu.org/licenses/>. 19 */ 20 21 /* 22 * Declaration of flexible Lookup Tables 23 * 24 * Usage: 25 * 26 * LUT<type> name (size); 27 * LUT<type> name (size, flags); 28 * 29 * creates an array which is valid within the normal C/C++ scope "{ ... }" 30 * 31 * access to elements is a simple as: 32 * 33 * LUT<float> my_lut (10); 34 * float value = my_lut[3]; 35 * float value = my_lut[2.5]; // this will interpolate 36 * 37 * when using a float type index it will interpolate the lookup values 38 * 39 * extra setting in flags: (clipping is set by default) 40 * LUT_CLIP_ABOVE 41 * LUT_CLIP_BELOW 42 * 43 * example: 44 * LUT<float> my_lut (10,LUT_CLIP_BELOW); 45 * float value = my_lut[22.5]; // this will extrapolate 46 * float value = my_lut[-22.5]; // this will not extrapolate 47 * 48 * LUT<float> my_lut (10,0); // this will extrapolate on either side 49 * 50 * shotcuts: 51 * 52 * LUTf stands for LUT<float> 53 * LUTi stands for LUT<int> 54 * LUTu stands for LUT<unsigned int> 55 * LUTd stands for LUT<double> 56 * LUTuc stands for LUT<unsigned char> 57 */ 58 59 #pragma once 60 61 #include <cstring> 62 #include <cstdint> 63 #include <cassert> 64 65 #ifndef NDEBUG 66 #include <fstream> 67 #endif 68 69 #include "opthelper.h" 70 #include "rt_math.h" 71 72 // Bit representations of flags 73 enum { 74 LUT_CLIP_OFF, // LUT does not clip input values 75 LUT_CLIP_BELOW, // LUT clips input values at lower bound 76 LUT_CLIP_ABOVE // LUT clips input values at upper bound 77 }; 78 79 template<typename T> 80 class LUT; 81 82 using LUTf = LUT<float>; 83 using LUTi = LUT<int32_t>; 84 using LUTu = LUT<uint32_t>; 85 using LUTd = LUT<double>; 86 using LUTuc = LUT<uint8_t>; 87 88 template<typename T> 89 class LUT 90 { 91 protected: 92 // list of variables ordered to improve cache speed 93 int maxs; 94 float maxsf; 95 T * data; 96 unsigned int clip; 97 unsigned int size; 98 unsigned int upperBound; // always equals size-1, parameter created for performance reason 99 private: 100 unsigned int owner; 101 #ifdef __SSE2__ 102 alignas(16) vfloat maxsv; 103 alignas(16) vfloat sizev; 104 alignas(16) vint sizeiv; 105 #endif 106 public: 107 /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed... 108 /// The user have to handle it itself, even if some method can (re)initialize it 109 bool dirty; 110 111 LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 112 { 113 #ifndef NDEBUG 114 115 if (s <= 0) { 116 printf("s<=0!\n"); 117 } 118 119 assert (s > 0); 120 #endif 121 dirty = true; 122 clip = flags; 123 // Add a few extra elements so [](vfloat) won't access out-of-bounds memory. 124 // The routine would still produce the right answer, but might cause issues 125 // with address/heap checking programs. 126 data = new T[s + 3]; 127 owner = 1; 128 size = s; 129 upperBound = size - 1; 130 maxs = size - 2; 131 maxsf = (float)maxs; 132 #ifdef __SSE2__ 133 maxsv = F2V( maxs ); 134 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 135 sizev = F2V( size - 1 ); 136 #endif 137 if (initZero) { 138 clear(); 139 } 140 } operator()141 void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 142 { 143 #ifndef NDEBUG 144 145 if (s <= 0) { 146 printf("s<=0!\n"); 147 } 148 149 assert (s > 0); 150 #endif 151 152 if (owner && data) { 153 delete[] data; 154 } 155 156 dirty = true; // Assumption! 157 clip = flags; 158 // See comment in constructor. 159 data = new T[s + 3]; 160 owner = 1; 161 size = s; 162 upperBound = size - 1; 163 maxs = size - 2; 164 maxsf = (float)maxs; 165 #ifdef __SSE2__ 166 maxsv = F2V( maxs ); 167 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 168 sizev = F2V( size - 1 ); 169 #endif 170 if (initZero) { 171 clear(); 172 } 173 174 } 175 LUT()176 LUT() 177 { 178 data = nullptr; 179 reset(); 180 #ifdef __SSE2__ 181 maxsv = ZEROV; 182 sizev = ZEROV; 183 sizeiv = _mm_setzero_si128(); 184 #endif 185 } 186 ~LUT()187 ~LUT() 188 { 189 if (owner) { 190 delete[] data; 191 #ifndef NDEBUG 192 data = (T*)0xBAADF00D; 193 #endif 194 } 195 } 196 197 explicit LUT(const LUT&) = delete; 198 setClip(int flags)199 void setClip(int flags) 200 { 201 clip = flags; 202 } 203 getClip()204 int getClip() const { 205 return clip; 206 } 207 208 /** @brief Get the number of element in the LUT (i.e. dimension of the array) 209 * For a LUT(500), it will return 500 210 * @return number of element in the array 211 */ getSize()212 unsigned int getSize() const 213 { 214 return size; 215 } 216 217 /** @brief Get the highest value possible (i.e. dimension of the array) 218 * For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499 219 * @return number of element in the array 220 */ getUpperBound()221 unsigned int getUpperBound() const 222 { 223 return size > 0 ? upperBound : 0; 224 } 225 226 LUT<T> & operator=(const LUT<T>& rhs) 227 { 228 if (this != &rhs) { 229 if (rhs.size > this->size) { 230 delete [] this->data; 231 this->data = nullptr; 232 } 233 234 if (this->data == nullptr) { 235 // See comment in constructor. 236 this->data = new T[rhs.size + 3]; 237 } 238 239 this->clip = rhs.clip; 240 this->owner = 1; 241 memcpy(this->data, rhs.data, rhs.size * sizeof(T)); 242 this->size = rhs.size; 243 this->upperBound = rhs.upperBound; 244 this->maxs = this->size - 2; 245 this->maxsf = (float)this->maxs; 246 #ifdef __SSE2__ 247 this->maxsv = F2V( this->size - 2); 248 this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); 249 this->sizev = F2V( this->size - 1 ); 250 #endif 251 } 252 253 return *this; 254 } 255 256 // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t). 257 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 258 LUT<T> & operator+=(const LUT<T>& rhs) 259 { 260 if (rhs.size == this->size) { 261 #ifdef _OPENMP 262 #pragma omp simd 263 #endif 264 265 for(unsigned int i = 0; i < this->size; i++) { 266 data[i] += rhs.data[i]; 267 } 268 } 269 270 return *this; 271 } 272 273 // multiply all elements of LUT<float> with a constant float value 274 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 275 LUT<float> & operator*=(float factor) 276 { 277 #ifdef _OPENMP 278 #pragma omp simd 279 #endif 280 281 for(unsigned int i = 0; i < this->size; i++) { 282 data[i] *= factor; 283 } 284 285 return *this; 286 } 287 288 // divide all elements of LUT<float> by a constant float value 289 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 290 LUT<float> & operator/=(float divisor) 291 { 292 #ifdef _OPENMP 293 #pragma omp simd 294 #endif 295 296 for(unsigned int i = 0; i < this->size; i++) { 297 data[i] /= divisor; 298 } 299 300 return *this; 301 } 302 303 304 // use with integer indices 305 T& operator[](int index) const 306 { 307 return data[ rtengine::LIM<int>(index, 0, upperBound) ]; 308 } 309 310 #ifdef __SSE2__ 311 312 313 // NOTE: This function requires LUTs which clips only at lower bound cb(vfloat indexv)314 vfloat cb(vfloat indexv) const 315 { 316 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 317 318 // Clamp and convert to integer values. Extract out of SSE register because all 319 // lookup operations use regular addresses. 320 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 321 vint indexes = _mm_cvttps_epi32(clampedIndexes); 322 int indexArray[4]; 323 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 324 325 // Load data from the table. This reads more than necessary, but there don't seem 326 // to exist more granular operations (though we could try non-SSE). 327 // Cast to int for convenience in the next operation (partial transpose). 328 vint values[4]; 329 for (int i = 0; i < 4; ++i) { 330 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 331 } 332 333 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 334 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 335 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 336 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 337 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 338 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 339 340 vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes); 341 return vintpf(diff, upper, lower); 342 } 343 344 // NOTE: This version requires LUTs which clip at upper and lower bounds 345 // (which is the default). 346 vfloat operator[](vfloat indexv) const 347 { 348 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 349 350 // Clamp and convert to integer values. Extract out of SSE register because all 351 // lookup operations use regular addresses. 352 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 353 vint indexes = _mm_cvttps_epi32(clampedIndexes); 354 int indexArray[4]; 355 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 356 357 // Load data from the table. This reads more than necessary, but there don't seem 358 // to exist more granular operations (though we could try non-SSE). 359 // Cast to int for convenience in the next operation (partial transpose). 360 vint values[4]; 361 for (int i = 0; i < 4; ++i) { 362 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 363 } 364 365 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 366 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 367 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 368 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 369 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 370 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 371 372 vfloat diff = vclampf(indexv, ZEROV, sizev) - _mm_cvtepi32_ps(indexes); // this automagically uses ZEROV in case indexv is NaN 373 return vintpf(diff, upper, lower); 374 } 375 376 // NOTE: This version requires LUTs which do not clip at upper and lower bounds operator()377 vfloat operator()(vfloat indexv) const 378 { 379 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 380 381 // Clamp and convert to integer values. Extract out of SSE register because all 382 // lookup operations use regular addresses. 383 vfloat clampedIndexes = vclampf(indexv, ZEROV, maxsv); // this automagically uses ZEROV in case indexv is NaN 384 vint indexes = _mm_cvttps_epi32(clampedIndexes); 385 int indexArray[4]; 386 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 387 388 // Load data from the table. This reads more than necessary, but there don't seem 389 // to exist more granular operations (though we could try non-SSE). 390 // Cast to int for convenience in the next operation (partial transpose). 391 vint values[4]; 392 for (int i = 0; i < 4; ++i) { 393 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 394 } 395 396 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 397 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 398 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 399 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 400 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 401 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 402 403 vfloat diff = indexv - _mm_cvtepi32_ps(indexes); 404 return vintpf(diff, upper, lower); 405 } 406 407 // vectorized LUT access with integer indices. Clips at lower and upper bounds 408 #ifdef __SSE4_1__ 409 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 410 vfloat operator[](vint idxv) const 411 { 412 idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); 413 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 414 return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]); 415 } 416 #else 417 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 418 vfloat operator[](vint idxv) const 419 { 420 // convert to float because SSE2 has no min/max for 32bit integers 421 vfloat tempv = vclampf(_mm_cvtepi32_ps(idxv), ZEROV, sizev); // this automagically uses ZEROV in case idxv is NaN (which will never happen because it is a vector of int) 422 idxv = _mm_cvttps_epi32(tempv); 423 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 424 return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)], 425 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))], 426 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))], 427 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]); 428 } 429 #endif 430 #endif 431 432 // use with float indices 433 template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type> 434 T operator[](V index) const 435 { 436 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 437 438 if (index < 0.f) { 439 if (clip & LUT_CLIP_BELOW) { 440 return data[0]; 441 } 442 443 idx = 0; 444 } else if (index > maxsf) { 445 if (clip & LUT_CLIP_ABOVE) { 446 return data[upperBound]; 447 } 448 449 idx = maxs; 450 } 451 452 float diff = index - (float) idx; 453 T p1 = data[idx]; 454 T p2 = data[idx + 1] - p1; 455 return (p1 + p2 * diff); 456 } 457 458 // Return the value for "index" that is in the [0-1] range. 459 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> getVal01(float index)460 T getVal01 (float index) const 461 { 462 index *= (float)upperBound; 463 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 464 465 if (index < 0.f) { 466 if (clip & LUT_CLIP_BELOW) { 467 return data[0]; 468 } 469 470 idx = 0; 471 } else if (index > maxsf) { 472 if (clip & LUT_CLIP_ABOVE) { 473 return data[upperBound]; 474 } 475 476 idx = maxs; 477 } 478 479 float diff = index - (float) idx; 480 T p1 = data[idx]; 481 T p2 = data[idx + 1] - p1; 482 return (p1 + p2 * diff); 483 } 484 485 operator bool (void) const 486 { 487 return size > 0; 488 } 489 clear(void)490 void clear(void) 491 { 492 if (data && size) { 493 memset(data, 0, size * sizeof(T)); 494 } 495 } 496 reset(void)497 void reset(void) 498 { 499 if (data) { 500 delete[] data; 501 } 502 503 dirty = true; 504 data = nullptr; 505 owner = 1; 506 size = 0; 507 upperBound = 0; 508 maxs = 0; 509 maxsf = 0.f; 510 clip = 0; 511 } 512 513 // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor) 514 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 515 void makeIdentity(float divisor = 1.f) 516 { 517 if(divisor == 1.f) { 518 for(unsigned int i = 0; i < size; i++) { 519 data[i] = i; 520 } 521 } else { 522 for(unsigned int i = 0; i < size; i++) { 523 data[i] = i / divisor; 524 } 525 } 526 } 527 528 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) 529 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 530 void compressTo(LUT<T> &dest, unsigned int numVals = 0) const 531 { 532 numVals = numVals == 0 ? size : numVals; 533 numVals = std::min(numVals, size); 534 float divisor = numVals - 1; 535 float mult = (dest.size - 1) / divisor; 536 537 for (unsigned int i = 0; i < numVals; i++) { 538 int hi = (int)(mult * i); 539 dest.data[hi] += this->data[i] ; 540 } 541 } 542 543 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passThrough LUT to calculate indexes 544 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)545 void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const 546 { 547 if(passThrough) { 548 numVals = std::min(numVals, size); 549 numVals = std::min(numVals, passThrough.getSize()); 550 float mult = dest.size - 1; 551 552 for (unsigned int i = 0; i < numVals; i++) { 553 int hi = (int)(mult * passThrough[i]); 554 dest[hi] += this->data[i] ; 555 } 556 } 557 } 558 559 // compute sum and average of a LUT<uint32_t> 560 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> getSumAndAverage(float & sum,float & avg)561 void getSumAndAverage(float &sum, float &avg) const 562 { 563 sum = 0.f; 564 avg = 0.f; 565 int i = 0; 566 #ifdef __SSE2__ 567 vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); 568 vfloat fourv = F2V(4.f); 569 vint sumv = (vint)ZEROV; 570 vfloat avgv = ZEROV; 571 572 for(; i < static_cast<int>(size) - 3; i += 4) { 573 vint datav = _mm_loadu_si128((__m128i*)&data[i]); 574 sumv += datav; 575 avgv += iv * _mm_cvtepi32_ps(datav); 576 iv += fourv; 577 578 } 579 580 sum = vhadd(_mm_cvtepi32_ps(sumv)); 581 avg = vhadd(avgv); 582 #endif 583 584 for (; i < static_cast<int>(size); i++) { 585 T val = data[i]; 586 sum += val; 587 avg += i * val; 588 } 589 590 avg /= sum; 591 } 592 593 594 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 595 void makeConstant(float value, unsigned int numVals = 0) 596 { 597 numVals = numVals == 0 ? size : numVals; 598 numVals = std::min(numVals, size); 599 600 for(unsigned int i = 0; i < numVals; i++) { 601 data[i] = value; 602 } 603 } 604 605 // share the buffer with another LUT, handy for same data but different clip flags 606 void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE) 607 { 608 if (owner && data) { 609 delete[] data; 610 } 611 612 dirty = false; // Assumption 613 clip = flags; 614 data = source.data; 615 owner = 0; 616 size = source.getSize(); 617 upperBound = size - 1; 618 maxs = size - 2; 619 maxsf = (float)maxs; 620 #ifdef __SSE2__ 621 maxsv = F2V( size - 2); 622 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 623 sizev = F2V( size - 1 ); 624 #endif 625 } 626 627 628 }; 629