1 /* 2 * LUT.h 3 * This file is part of RawTherapee. 4 * 5 * Copyright (c) 2011 Jan Rinze Peterzon (janrinze@gmail.com) 6 * 7 * RawTherapee is free software: you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation, either version 3 of the License, or 10 * (at your option) any later version. 11 * 12 * RawTherapee is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with RawTherapee. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 /* 22 * Declaration of flexible Lookup Tables 23 * 24 * Usage: 25 * 26 * LUT<type> name (size); 27 * LUT<type> name (size, flags); 28 * 29 * creates an array which is valid within the normal C/C++ scope "{ ... }" 30 * 31 * access to elements is a simple as: 32 * 33 * LUT<float> my_lut (10); 34 * float value = my_lut[3]; 35 * float value = my_lut[2.5]; // this will interpolate 36 * 37 * when using a float type index it will interpolate the lookup values 38 * 39 * extra setting in flags: (clipping is set by default) 40 * LUT_CLIP_ABOVE 41 * LUT_CLIP_BELOW 42 * 43 * example: 44 * LUT<float> my_lut (10,LUT_CLIP_BELOW); 45 * float value = my_lut[22.5]; // this will extrapolate 46 * float value = my_lut[-22.5]; // this will not extrapolate 47 * 48 * LUT<float> my_lut (10,0); // this will extrapolate on either side 49 * 50 * shotcuts: 51 * 52 * LUTf stands for LUT<float> 53 * LUTi stands for LUT<int> 54 * LUTu stands for LUT<unsigned int> 55 * LUTd stands for LUT<double> 56 * LUTuc stands for LUT<unsigned char> 57 */ 58 59 #ifndef LUT_H_ 60 #define LUT_H_ 61 62 #include <cstring> 63 #include <cstdint> 64 #ifndef NDEBUG 65 #include <cassert> 66 #endif 67 #include "opthelper.h" 68 #include "rt_math.h" 69 70 // Bit representations of flags 71 enum { 72 LUT_CLIP_OFF, // LUT does not clip input values 73 LUT_CLIP_BELOW, // LUT clips input values at lower bound 74 LUT_CLIP_ABOVE // LUT clips input values at upper bound 75 }; 76 77 template<typename T> 78 class LUT; 79 80 using LUTf = LUT<float>; 81 using LUTi = LUT<int32_t>; 82 using LUTu = LUT<uint32_t>; 83 using LUTd = LUT<double>; 84 using LUTuc = LUT<uint8_t>; 85 86 template<typename T> 87 class LUT 88 { 89 protected: 90 // list of variables ordered to improve cache speed 91 int maxs; 92 float maxsf; 93 T * data; 94 unsigned int clip; 95 unsigned int size; 96 unsigned int upperBound; // always equals size-1, parameter created for performance reason 97 private: 98 unsigned int owner; 99 #ifdef __SSE2__ 100 alignas(16) vfloat maxsv; 101 alignas(16) vfloat sizev; 102 alignas(16) vint sizeiv; 103 #endif 104 public: 105 /// convenience flag! If one doesn't want to delete the buffer but want to flag it to be recomputed... 106 /// The user have to handle it itself, even if some method can (re)initialize it 107 bool dirty; 108 109 LUT(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 110 { 111 #ifndef NDEBUG 112 113 if (s <= 0) { 114 printf("s<=0!\n"); 115 } 116 117 assert (s > 0); 118 #endif 119 dirty = true; 120 clip = flags; 121 // Add a few extra elements so [](vfloat) won't access out-of-bounds memory. 122 // The routine would still produce the right answer, but might cause issues 123 // with address/heap checking programs. 124 data = new T[s + 3]; 125 owner = 1; 126 size = s; 127 upperBound = size - 1; 128 maxs = size - 2; 129 maxsf = (float)maxs; 130 #ifdef __SSE2__ 131 maxsv = F2V( maxs ); 132 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 133 sizev = F2V( size - 1 ); 134 #endif 135 if (initZero) { 136 clear(); 137 } 138 } operator()139 void operator ()(int s, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE, bool initZero = false) 140 { 141 #ifndef NDEBUG 142 143 if (s <= 0) { 144 printf("s<=0!\n"); 145 } 146 147 assert (s > 0); 148 #endif 149 150 if (owner && data) { 151 delete[] data; 152 } 153 154 dirty = true; // Assumption! 155 clip = flags; 156 // See comment in constructor. 157 data = new T[s + 3]; 158 owner = 1; 159 size = s; 160 upperBound = size - 1; 161 maxs = size - 2; 162 maxsf = (float)maxs; 163 #ifdef __SSE2__ 164 maxsv = F2V( maxs ); 165 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 166 sizev = F2V( size - 1 ); 167 #endif 168 if (initZero) { 169 clear(); 170 } 171 172 } 173 LUT()174 LUT() 175 { 176 data = nullptr; 177 reset(); 178 #ifdef __SSE2__ 179 maxsv = ZEROV; 180 sizev = ZEROV; 181 sizeiv = _mm_setzero_si128(); 182 #endif 183 } 184 ~LUT()185 ~LUT() 186 { 187 if (owner) { 188 delete[] data; 189 #ifndef NDEBUG 190 data = (T*)0xBAADF00D; 191 #endif 192 } 193 } 194 setClip(int flags)195 void setClip(int flags) 196 { 197 clip = flags; 198 } 199 getClip()200 int getClip() const { 201 return clip; 202 } 203 204 /** @brief Get the number of element in the LUT (i.e. dimension of the array) 205 * For a LUT(500), it will return 500 206 * @return number of element in the array 207 */ getSize()208 unsigned int getSize() const 209 { 210 return size; 211 } 212 213 /** @brief Get the highest value possible (i.e. dimension of the array) 214 * For a LUT(500), it will return 499, because 500 elements, starting from 0, goes up to 499 215 * @return number of element in the array 216 */ getUpperBound()217 unsigned int getUpperBound() const 218 { 219 return size > 0 ? upperBound : 0; 220 } 221 222 LUT<T> & operator=(LUT<T> &rhs) 223 { 224 if (this != &rhs) { 225 if (rhs.size > this->size) { 226 delete [] this->data; 227 this->data = nullptr; 228 } 229 230 if (this->data == nullptr) { 231 // See comment in constructor. 232 this->data = new T[rhs.size + 3]; 233 } 234 235 this->clip = rhs.clip; 236 this->owner = 1; 237 memcpy(this->data, rhs.data, rhs.size * sizeof(T)); 238 this->size = rhs.size; 239 this->upperBound = rhs.upperBound; 240 this->maxs = this->size - 2; 241 this->maxsf = (float)this->maxs; 242 #ifdef __SSE2__ 243 this->maxsv = F2V( this->size - 2); 244 this->sizeiv = _mm_set1_epi32( (int)(this->size - 1) ); 245 this->sizev = F2V( this->size - 1 ); 246 #endif 247 } 248 249 return *this; 250 } 251 252 // handy to sum up per thread histograms. #pragma omp simd speeds up the loop by about factor 3 for LUTu (uint32_t). 253 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 254 LUT<T> & operator+=(LUT<T> &rhs) 255 { 256 if (rhs.size == this->size) { 257 #ifdef _OPENMP 258 #pragma omp simd 259 #endif 260 261 for(unsigned int i = 0; i < this->size; i++) { 262 data[i] += rhs.data[i]; 263 } 264 } 265 266 return *this; 267 } 268 269 // multiply all elements of LUT<float> with a constant float value 270 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 271 LUT<float> & operator*=(float factor) 272 { 273 #ifdef _OPENMP 274 #pragma omp simd 275 #endif 276 277 for(unsigned int i = 0; i < this->size; i++) { 278 data[i] *= factor; 279 } 280 281 return *this; 282 } 283 284 // divide all elements of LUT<float> by a constant float value 285 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 286 LUT<float> & operator/=(float divisor) 287 { 288 #ifdef _OPENMP 289 #pragma omp simd 290 #endif 291 292 for(unsigned int i = 0; i < this->size; i++) { 293 data[i] /= divisor; 294 } 295 296 return *this; 297 } 298 299 300 // use with integer indices 301 T& operator[](int index) const 302 { 303 return data[ librtprocess::LIM<int>(index, 0, upperBound) ]; 304 } 305 306 #ifdef __SSE2__ 307 308 309 // NOTE: This function requires LUTs which clips only at lower bound cb(vfloat indexv)310 vfloat cb(vfloat indexv) const 311 { 312 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 313 314 // Clamp and convert to integer values. Extract out of SSE register because all 315 // lookup operations use regular addresses. 316 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 317 vint indexes = _mm_cvttps_epi32(clampedIndexes); 318 int indexArray[4]; 319 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 320 321 // Load data from the table. This reads more than necessary, but there don't seem 322 // to exist more granular operations (though we could try non-SSE). 323 // Cast to int for convenience in the next operation (partial transpose). 324 vint values[4]; 325 for (int i = 0; i < 4; ++i) { 326 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 327 } 328 329 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 330 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 331 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 332 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 333 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 334 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 335 336 vfloat diff = vmaxf(ZEROV, indexv) - _mm_cvtepi32_ps(indexes); 337 return vintpf(diff, upper, lower); 338 } 339 340 // NOTE: This version requires LUTs which clip at upper and lower bounds 341 // (which is the default). 342 vfloat operator[](vfloat indexv) const 343 { 344 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 345 346 // Clamp and convert to integer values. Extract out of SSE register because all 347 // lookup operations use regular addresses. 348 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 349 vint indexes = _mm_cvttps_epi32(clampedIndexes); 350 int indexArray[4]; 351 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 352 353 // Load data from the table. This reads more than necessary, but there don't seem 354 // to exist more granular operations (though we could try non-SSE). 355 // Cast to int for convenience in the next operation (partial transpose). 356 vint values[4]; 357 for (int i = 0; i < 4; ++i) { 358 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 359 } 360 361 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 362 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 363 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 364 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 365 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 366 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 367 368 vfloat diff = vmaxf(ZEROV, vminf(sizev, indexv)) - _mm_cvtepi32_ps(indexes); 369 return vintpf(diff, upper, lower); 370 } 371 372 // NOTE: This version requires LUTs which do not clip at upper and lower bounds operator()373 vfloat operator()(vfloat indexv) const 374 { 375 static_assert(std::is_same<T, float>::value, "This method only works for float LUTs"); 376 377 // Clamp and convert to integer values. Extract out of SSE register because all 378 // lookup operations use regular addresses. 379 vfloat clampedIndexes = vmaxf(ZEROV, vminf(maxsv, indexv)); 380 vint indexes = _mm_cvttps_epi32(clampedIndexes); 381 int indexArray[4]; 382 _mm_storeu_si128(reinterpret_cast<__m128i*>(&indexArray[0]), indexes); 383 384 // Load data from the table. This reads more than necessary, but there don't seem 385 // to exist more granular operations (though we could try non-SSE). 386 // Cast to int for convenience in the next operation (partial transpose). 387 vint values[4]; 388 for (int i = 0; i < 4; ++i) { 389 values[i] = _mm_castps_si128(LVFU(data[indexArray[i]])); 390 } 391 392 // Partial 4x4 transpose operation. We want two new vectors, the first consisting 393 // of [values[0][0] ... values[3][0]] and the second [values[0][1] ... values[3][1]]. 394 __m128i temp0 = _mm_unpacklo_epi32(values[0], values[1]); 395 __m128i temp1 = _mm_unpacklo_epi32(values[2], values[3]); 396 vfloat lower = _mm_castsi128_ps(_mm_unpacklo_epi64(temp0, temp1)); 397 vfloat upper = _mm_castsi128_ps(_mm_unpackhi_epi64(temp0, temp1)); 398 399 vfloat diff = indexv - _mm_cvtepi32_ps(indexes); 400 return vintpf(diff, upper, lower); 401 } 402 403 // vectorized LUT access with integer indices. Clips at lower and upper bounds 404 #ifdef __SSE4_1__ 405 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 406 vfloat operator[](vint idxv) const 407 { 408 idxv = _mm_max_epi32( _mm_setzero_si128(), _mm_min_epi32(idxv, sizeiv)); 409 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 410 return _mm_setr_ps(data[_mm_extract_epi32(idxv,0)], data[_mm_extract_epi32(idxv,1)], data[_mm_extract_epi32(idxv,2)], data[_mm_extract_epi32(idxv,3)]); 411 } 412 #else 413 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 414 vfloat operator[](vint idxv) const 415 { 416 vfloat tempv = vmaxf(ZEROV, vminf(sizev, _mm_cvtepi32_ps(idxv))); // convert to float because SSE2 has no min/max for 32bit integers 417 idxv = _mm_cvttps_epi32(tempv); 418 // access the LUT 4 times. Trust the compiler. It generates good code here, better than hand written SSE code 419 return _mm_setr_ps(data[_mm_cvtsi128_si32(idxv)], 420 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(1, 1, 1, 1)))], 421 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(2, 2, 2, 2)))], 422 data[_mm_cvtsi128_si32(_mm_shuffle_epi32(idxv, _MM_SHUFFLE(3, 3, 3, 3)))]); 423 } 424 #endif 425 #endif 426 427 // use with float indices 428 template<typename U = T, typename V, typename = typename std::enable_if<std::is_floating_point<V>::value && std::is_same<U, float>::value>::type> 429 T operator[](V index) const 430 { 431 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 432 433 if (index < 0.f) { 434 if (clip & LUT_CLIP_BELOW) { 435 return data[0]; 436 } 437 438 idx = 0; 439 } else if (idx > maxs) { 440 if (clip & LUT_CLIP_ABOVE) { 441 return data[upperBound]; 442 } 443 444 idx = maxs; 445 } 446 447 float diff = index - (float) idx; 448 T p1 = data[idx]; 449 T p2 = data[idx + 1] - p1; 450 return (p1 + p2 * diff); 451 } 452 453 // Return the value for "index" that is in the [0-1] range. 454 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> getVal01(float index)455 T getVal01 (float index) const 456 { 457 index *= (float)upperBound; 458 int idx = (int)index; // don't use floor! The difference in negative space is no problems here 459 460 if (index < 0.f) { 461 if (clip & LUT_CLIP_BELOW) { 462 return data[0]; 463 } 464 465 idx = 0; 466 } else if (index > maxsf) { 467 if (clip & LUT_CLIP_ABOVE) { 468 return data[upperBound]; 469 } 470 471 idx = maxs; 472 } 473 474 float diff = index - (float) idx; 475 T p1 = data[idx]; 476 T p2 = data[idx + 1] - p1; 477 return (p1 + p2 * diff); 478 } 479 480 operator bool (void) const 481 { 482 return size > 0; 483 } 484 clear(void)485 void clear(void) 486 { 487 if (data && size) { 488 memset(data, 0, size * sizeof(T)); 489 } 490 } 491 reset(void)492 void reset(void) 493 { 494 if (data) { 495 delete[] data; 496 } 497 498 dirty = true; 499 data = nullptr; 500 owner = 1; 501 size = 0; 502 upperBound = 0; 503 maxs = 0; 504 maxsf = 0.f; 505 clip = 0; 506 } 507 508 // create an identity LUT (LUT(x) = x) or a scaled identity LUT (LUT(x) = x / divisor) 509 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 510 void makeIdentity(float divisor = 1.f) 511 { 512 if(divisor == 1.f) { 513 for(unsigned int i = 0; i < size; i++) { 514 data[i] = i; 515 } 516 } else { 517 for(unsigned int i = 0; i < size; i++) { 518 data[i] = i / divisor; 519 } 520 } 521 } 522 523 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) 524 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> 525 void compressTo(LUT<T> &dest, unsigned int numVals = 0) const 526 { 527 numVals = numVals == 0 ? size : numVals; 528 numVals = std::min(numVals, size); 529 float divisor = numVals - 1; 530 float mult = (dest.size - 1) / divisor; 531 532 for (unsigned int i = 0; i < numVals; i++) { 533 int hi = (int)(mult * i); 534 dest.data[hi] += this->data[i] ; 535 } 536 } 537 538 // compress a LUT<uint32_t> with size y into a LUT<uint32_t> with size x (y>x) by using the passTrough LUT to calculate indexes 539 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> compressTo(LUT<T> & dest,unsigned int numVals,const LUT<float> & passThrough)540 void compressTo(LUT<T> &dest, unsigned int numVals, const LUT<float> &passThrough) const 541 { 542 if(passThrough) { 543 numVals = std::min(numVals, size); 544 numVals = std::min(numVals, passThrough.getSize()); 545 float mult = dest.size - 1; 546 547 for (unsigned int i = 0; i < numVals; i++) { 548 int hi = (int)(mult * passThrough[i]); 549 dest[hi] += this->data[i] ; 550 } 551 } 552 } 553 554 // compute sum and average of a LUT<uint32_t> 555 template<typename U = T, typename = typename std::enable_if<std::is_same<U, std::uint32_t>::value>::type> getSumAndAverage(float & sum,float & avg)556 void getSumAndAverage(float &sum, float &avg) const 557 { 558 sum = 0.f; 559 avg = 0.f; 560 int i = 0; 561 #ifdef __SSE2__ 562 vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f); 563 vfloat fourv = F2V(4.f); 564 vint sumv = (vint)ZEROV; 565 vfloat avgv = ZEROV; 566 567 for(; i < static_cast<int>(size) - 3; i += 4) { 568 vint datav = _mm_loadu_si128((__m128i*)&data[i]); 569 sumv += datav; 570 avgv += iv * _mm_cvtepi32_ps(datav); 571 iv += fourv; 572 573 } 574 575 sum = vhadd(_mm_cvtepi32_ps(sumv)); 576 avg = vhadd(avgv); 577 #endif 578 579 for (; i < static_cast<int>(size); i++) { 580 T val = data[i]; 581 sum += val; 582 avg += i * val; 583 } 584 585 avg /= sum; 586 } 587 588 589 template<typename U = T, typename = typename std::enable_if<std::is_same<U, float>::value>::type> 590 void makeConstant(float value, unsigned int numVals = 0) 591 { 592 numVals = numVals == 0 ? size : numVals; 593 numVals = std::min(numVals, size); 594 595 for(unsigned int i = 0; i < numVals; i++) { 596 data[i] = value; 597 } 598 } 599 600 // share the buffer with another LUT, handy for same data but different clip flags 601 void share(const LUT<T> &source, int flags = LUT_CLIP_BELOW | LUT_CLIP_ABOVE) 602 { 603 if (owner && data) { 604 delete[] data; 605 } 606 607 dirty = false; // Assumption 608 clip = flags; 609 data = source.data; 610 owner = 0; 611 size = source.getSize(); 612 upperBound = size - 1; 613 maxs = size - 2; 614 maxsf = (float)maxs; 615 #ifdef __SSE2__ 616 maxsv = F2V( size - 2); 617 sizeiv = _mm_set1_epi32( (int)(size - 1) ); 618 sizev = F2V( size - 1 ); 619 #endif 620 } 621 622 623 }; 624 625 #endif /* LUT_H_ */ 626