/*
Convection Texture Tools
Copyright (c) 2018 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-------------------------------------------------------------------------------------

Portions based on DirectX Texture Library (DirectXTex)

Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
30 31 http://go.microsoft.com/fwlink/?LinkId=248926 32 */ 33 #include "ConvectionKernels.h" 34 #include "ConvectionKernels_BC7_SingleColor.h" 35 36 #if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__) 37 #define CVTT_USE_SSE2 38 #endif 39 40 #ifdef CVTT_USE_SSE2 41 #include <emmintrin.h> 42 #endif 43 44 #include <float.h> 45 #include <assert.h> 46 #include <string.h> 47 #include <algorithm> 48 #include <math.h> 49 50 #define UNREFERENCED_PARAMETER(n) ((void)n) 51 52 namespace cvtt 53 { 54 #ifdef CVTT_USE_SSE2 55 // SSE2 version 56 struct ParallelMath 57 { 58 typedef uint16_t ScalarUInt16; 59 typedef int16_t ScalarSInt16; 60 61 template<unsigned int TRoundingMode> 62 struct RoundForScope 63 { 64 unsigned int m_oldCSR; 65 RoundForScopecvtt::ParallelMath::RoundForScope66 RoundForScope() 67 { 68 m_oldCSR = _mm_getcsr(); 69 _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode)); 70 } 71 ~RoundForScopecvtt::ParallelMath::RoundForScope72 ~RoundForScope() 73 { 74 _mm_setcsr(m_oldCSR); 75 } 76 }; 77 78 struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> 79 { 80 }; 81 82 struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> 83 { 84 }; 85 86 struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> 87 { 88 }; 89 90 struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> 91 { 92 }; 93 94 static const int ParallelSize = 8; 95 96 enum Int16Subtype 97 { 98 IntSubtype_Signed, 99 IntSubtype_UnsignedFull, 100 IntSubtype_UnsignedTruncated, 101 IntSubtype_Abstract, 102 }; 103 104 template<int TSubtype> 105 struct VInt16 106 { 107 __m128i m_value; 108 operator +cvtt::ParallelMath::VInt16109 inline VInt16 operator+(int16_t other) const 110 { 111 VInt16 result; 112 result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other))); 113 return result; 114 } 115 operator +cvtt::ParallelMath::VInt16116 inline VInt16 operator+(const VInt16 &other) const 117 { 118 VInt16 result; 119 result.m_value = 
_mm_add_epi16(m_value, other.m_value); 120 return result; 121 } 122 operator |cvtt::ParallelMath::VInt16123 inline VInt16 operator|(const VInt16 &other) const 124 { 125 VInt16 result; 126 result.m_value = _mm_or_si128(m_value, other.m_value); 127 return result; 128 } 129 operator &cvtt::ParallelMath::VInt16130 inline VInt16 operator&(const VInt16 &other) const 131 { 132 VInt16 result; 133 result.m_value = _mm_and_si128(m_value, other.m_value); 134 return result; 135 } 136 operator -cvtt::ParallelMath::VInt16137 inline VInt16 operator-(const VInt16 &other) const 138 { 139 VInt16 result; 140 result.m_value = _mm_sub_epi16(m_value, other.m_value); 141 return result; 142 } 143 operator <<cvtt::ParallelMath::VInt16144 inline VInt16 operator<<(int bits) const 145 { 146 VInt16 result; 147 result.m_value = _mm_slli_epi16(m_value, bits); 148 return result; 149 } 150 }; 151 152 typedef VInt16<IntSubtype_Signed> SInt16; 153 typedef VInt16<IntSubtype_UnsignedFull> UInt16; 154 typedef VInt16<IntSubtype_UnsignedTruncated> UInt15; 155 typedef VInt16<IntSubtype_Abstract> AInt16; 156 157 template<int TSubtype> 158 struct VInt32 159 { 160 __m128i m_values[2]; 161 operator +cvtt::ParallelMath::VInt32162 inline VInt32 operator+(const VInt32& other) const 163 { 164 VInt32 result; 165 result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]); 166 result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]); 167 return result; 168 } 169 operator -cvtt::ParallelMath::VInt32170 inline VInt32 operator-(const VInt32& other) const 171 { 172 VInt32 result; 173 result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]); 174 result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]); 175 return result; 176 } 177 operator <<cvtt::ParallelMath::VInt32178 inline VInt32 operator<<(const int other) const 179 { 180 VInt32 result; 181 result.m_values[0] = _mm_slli_epi32(m_values[0], other); 182 result.m_values[1] = _mm_slli_epi32(m_values[1], other); 183 return result; 
184 } 185 }; 186 187 typedef VInt32<IntSubtype_Signed> SInt32; 188 typedef VInt32<IntSubtype_UnsignedTruncated> UInt31; 189 typedef VInt32<IntSubtype_UnsignedFull> UInt32; 190 typedef VInt32<IntSubtype_Abstract> AInt32; 191 192 template<class TTargetType> 193 struct LosslessCast 194 { 195 #ifdef CVTT_PERMIT_ALIASING 196 template<int TSrcSubtype> Castcvtt::ParallelMath::LosslessCast197 static const TTargetType& Cast(const VInt32<TSrcSubtype> &src) 198 { 199 return reinterpret_cast<VInt32<TSubtype>&>(src); 200 } 201 202 template<int TSrcSubtype> Castcvtt::ParallelMath::LosslessCast203 static const TTargetType& Cast(const VInt16<TSrcSubtype> &src) 204 { 205 return reinterpret_cast<VInt16<TSubtype>&>(src); 206 } 207 #else 208 template<int TSrcSubtype> 209 static TTargetType Cast(const VInt32<TSrcSubtype> &src) 210 { 211 TTargetType result; 212 result.m_values[0] = src.m_values[0]; 213 result.m_values[1] = src.m_values[1]; 214 return result; 215 } 216 217 template<int TSrcSubtype> 218 static TTargetType Cast(const VInt16<TSrcSubtype> &src) 219 { 220 TTargetType result; 221 result.m_value = src.m_value; 222 return result; 223 } 224 #endif 225 }; 226 227 struct Int64 228 { 229 __m128i m_values[4]; 230 }; 231 232 struct Float 233 { 234 __m128 m_values[2]; 235 operator +cvtt::ParallelMath::Float236 inline Float operator+(const Float &other) const 237 { 238 Float result; 239 result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]); 240 result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]); 241 return result; 242 } 243 operator +cvtt::ParallelMath::Float244 inline Float operator+(float other) const 245 { 246 Float result; 247 result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other)); 248 result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other)); 249 return result; 250 } 251 operator -cvtt::ParallelMath::Float252 inline Float operator-(const Float& other) const 253 { 254 Float result; 255 result.m_values[0] = _mm_sub_ps(m_values[0], 
other.m_values[0]); 256 result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]); 257 return result; 258 } 259 operator -cvtt::ParallelMath::Float260 inline Float operator-() const 261 { 262 Float result; 263 result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]); 264 result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]); 265 return result; 266 } 267 operator *cvtt::ParallelMath::Float268 inline Float operator*(const Float& other) const 269 { 270 Float result; 271 result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]); 272 result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]); 273 return result; 274 } 275 operator *cvtt::ParallelMath::Float276 inline Float operator*(float other) const 277 { 278 Float result; 279 result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other)); 280 result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other)); 281 return result; 282 } 283 operator /cvtt::ParallelMath::Float284 inline Float operator/(const Float &other) const 285 { 286 Float result; 287 result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]); 288 result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]); 289 return result; 290 } 291 operator /cvtt::ParallelMath::Float292 inline Float operator/(float other) const 293 { 294 Float result; 295 result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other)); 296 result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other)); 297 return result; 298 } 299 }; 300 301 struct Int16CompFlag 302 { 303 __m128i m_value; 304 operator &cvtt::ParallelMath::Int16CompFlag305 inline Int16CompFlag operator&(const Int16CompFlag &other) const 306 { 307 Int16CompFlag result; 308 result.m_value = _mm_and_si128(m_value, other.m_value); 309 return result; 310 } 311 operator |cvtt::ParallelMath::Int16CompFlag312 inline Int16CompFlag operator|(const Int16CompFlag &other) const 313 { 314 Int16CompFlag result; 315 result.m_value = _mm_or_si128(m_value, other.m_value); 316 return 
result; 317 } 318 }; 319 320 struct FloatCompFlag 321 { 322 __m128 m_values[2]; 323 }; 324 325 template<int TSubtype> AbstractAddcvtt::ParallelMath326 static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) 327 { 328 VInt16<TSubtype> result; 329 result.m_value = _mm_add_epi16(a.m_value, b.m_value); 330 return result; 331 } 332 333 template<int TSubtype> AbstractSubtractcvtt::ParallelMath334 static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) 335 { 336 VInt16<TSubtype> result; 337 result.m_value = _mm_sub_epi16(a.m_value, b.m_value); 338 return result; 339 } 340 Selectcvtt::ParallelMath341 static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b) 342 { 343 Float result; 344 for (int i = 0; i < 2; i++) 345 result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i])); 346 return result; 347 } 348 349 template<int TSubtype> Selectcvtt::ParallelMath350 static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) 351 { 352 VInt16<TSubtype> result; 353 result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value)); 354 return result; 355 } 356 357 template<int TSubtype> SelectOrZerocvtt::ParallelMath358 static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a) 359 { 360 VInt16<TSubtype> result; 361 result.m_value = _mm_and_si128(flag.m_value, a.m_value); 362 return result; 363 } 364 365 template<int TSubtype> ConditionalSetcvtt::ParallelMath366 static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) 367 { 368 dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); 369 } 370 ConditionalNegatecvtt::ParallelMath371 static SInt16 ConditionalNegate(const Int16CompFlag &flag, 
const SInt16 &v) 372 { 373 SInt16 result; 374 result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15)); 375 return result; 376 } 377 378 template<int TSubtype> NotConditionalSetcvtt::ParallelMath379 static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) 380 { 381 dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value)); 382 } 383 ConditionalSetcvtt::ParallelMath384 static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) 385 { 386 for (int i = 0; i < 2; i++) 387 dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i])); 388 } 389 NotConditionalSetcvtt::ParallelMath390 static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) 391 { 392 for (int i = 0; i < 2; i++) 393 dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i])); 394 } 395 MakeSafeDenominatorcvtt::ParallelMath396 static void MakeSafeDenominator(Float& v) 397 { 398 ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f)); 399 } 400 TruncateToPrecisionSignedcvtt::ParallelMath401 static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision) 402 { 403 int lostBits = 16 - precision; 404 if (lostBits == 0) 405 return v; 406 407 SInt16 result; 408 result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); 409 return result; 410 } 411 TruncateToPrecisionUnsignedcvtt::ParallelMath412 static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision) 413 { 414 int lostBits = 16 - precision; 415 if (lostBits == 0) 416 return v; 417 418 UInt16 result; 419 result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); 420 return result; 421 } 422 Mincvtt::ParallelMath423 static UInt16 Min(const UInt16 &a, 
const UInt16 &b) 424 { 425 __m128i bitFlip = _mm_set1_epi16(-32768); 426 427 UInt16 result; 428 result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); 429 return result; 430 } 431 Mincvtt::ParallelMath432 static SInt16 Min(const SInt16 &a, const SInt16 &b) 433 { 434 SInt16 result; 435 result.m_value = _mm_min_epi16(a.m_value, b.m_value); 436 return result; 437 } 438 Mincvtt::ParallelMath439 static UInt15 Min(const UInt15 &a, const UInt15 &b) 440 { 441 UInt15 result; 442 result.m_value = _mm_min_epi16(a.m_value, b.m_value); 443 return result; 444 } 445 Mincvtt::ParallelMath446 static Float Min(const Float &a, const Float &b) 447 { 448 Float result; 449 for (int i = 0; i < 2; i++) 450 result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]); 451 return result; 452 } 453 Maxcvtt::ParallelMath454 static UInt16 Max(const UInt16 &a, const UInt16 &b) 455 { 456 __m128i bitFlip = _mm_set1_epi16(-32768); 457 458 UInt16 result; 459 result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); 460 return result; 461 } 462 Maxcvtt::ParallelMath463 static SInt16 Max(const SInt16 &a, const SInt16 &b) 464 { 465 SInt16 result; 466 result.m_value = _mm_max_epi16(a.m_value, b.m_value); 467 return result; 468 } 469 Maxcvtt::ParallelMath470 static UInt15 Max(const UInt15 &a, const UInt15 &b) 471 { 472 UInt15 result; 473 result.m_value = _mm_max_epi16(a.m_value, b.m_value); 474 return result; 475 } 476 Maxcvtt::ParallelMath477 static Float Max(const Float &a, const Float &b) 478 { 479 Float result; 480 for (int i = 0; i < 2; i++) 481 result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]); 482 return result; 483 } 484 Clampcvtt::ParallelMath485 static Float Clamp(const Float &v, float min, float max) 486 { 487 Float result; 488 for (int i = 0; i < 2; i++) 489 result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), 
_mm_set1_ps(min)); 490 return result; 491 } 492 Reciprocalcvtt::ParallelMath493 static Float Reciprocal(const Float &v) 494 { 495 Float result; 496 for (int i = 0; i < 2; i++) 497 result.m_values[i] = _mm_rcp_ps(v.m_values[i]); 498 return result; 499 } 500 ConvertLDRInputscvtt::ParallelMath501 static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut) 502 { 503 int16_t values[8]; 504 for (int i = 0; i < 8; i++) 505 values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; 506 507 chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); 508 } 509 ConvertHDRInputscvtt::ParallelMath510 static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut) 511 { 512 int16_t values[8]; 513 for (int i = 0; i < 8; i++) 514 values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; 515 516 chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); 517 } 518 MakeFloatcvtt::ParallelMath519 static Float MakeFloat(float v) 520 { 521 Float f; 522 f.m_values[0] = f.m_values[1] = _mm_set1_ps(v); 523 return f; 524 } 525 MakeFloatZerocvtt::ParallelMath526 static Float MakeFloatZero() 527 { 528 Float f; 529 f.m_values[0] = f.m_values[1] = _mm_setzero_ps(); 530 return f; 531 } 532 MakeUInt16cvtt::ParallelMath533 static UInt16 MakeUInt16(uint16_t v) 534 { 535 UInt16 result; 536 result.m_value = _mm_set1_epi16(static_cast<short>(v)); 537 return result; 538 } 539 MakeSInt16cvtt::ParallelMath540 static SInt16 MakeSInt16(int16_t v) 541 { 542 SInt16 result; 543 result.m_value = _mm_set1_epi16(static_cast<short>(v)); 544 return result; 545 } 546 MakeAInt16cvtt::ParallelMath547 static AInt16 MakeAInt16(int16_t v) 548 { 549 AInt16 result; 550 result.m_value = _mm_set1_epi16(static_cast<short>(v)); 551 return result; 552 } 553 MakeUInt15cvtt::ParallelMath554 static UInt15 MakeUInt15(uint16_t v) 555 { 
556 UInt15 result; 557 result.m_value = _mm_set1_epi16(static_cast<short>(v)); 558 return result; 559 } 560 MakeSInt32cvtt::ParallelMath561 static SInt32 MakeSInt32(int32_t v) 562 { 563 SInt32 result; 564 result.m_values[0] = _mm_set1_epi32(v); 565 result.m_values[1] = _mm_set1_epi32(v); 566 return result; 567 } 568 MakeUInt31cvtt::ParallelMath569 static UInt31 MakeUInt31(uint32_t v) 570 { 571 UInt31 result; 572 result.m_values[0] = _mm_set1_epi32(v); 573 result.m_values[1] = _mm_set1_epi32(v); 574 return result; 575 } 576 Extractcvtt::ParallelMath577 static uint16_t Extract(const UInt16 &v, int offset) 578 { 579 return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; 580 } 581 Extractcvtt::ParallelMath582 static int16_t Extract(const SInt16 &v, int offset) 583 { 584 return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; 585 } 586 Extractcvtt::ParallelMath587 static uint16_t Extract(const UInt15 &v, int offset) 588 { 589 return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; 590 } 591 Extractcvtt::ParallelMath592 static int16_t Extract(const AInt16 &v, int offset) 593 { 594 return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; 595 } 596 PutUInt16cvtt::ParallelMath597 static void PutUInt16(UInt16 &dest, int offset, uint16_t v) 598 { 599 reinterpret_cast<uint16_t*>(&dest)[offset] = v; 600 } 601 PutUInt15cvtt::ParallelMath602 static void PutUInt15(UInt15 &dest, int offset, uint16_t v) 603 { 604 reinterpret_cast<uint16_t*>(&dest)[offset] = v; 605 } 606 PutSInt16cvtt::ParallelMath607 static void PutSInt16(SInt16 &dest, int offset, int16_t v) 608 { 609 reinterpret_cast<int16_t*>(&dest)[offset] = v; 610 } 611 ExtractFloatcvtt::ParallelMath612 static float ExtractFloat(const Float& v, int offset) 613 { 614 return reinterpret_cast<const float*>(&v)[offset]; 615 } 616 PutFloatcvtt::ParallelMath617 static void PutFloat(Float &dest, int offset, float v) 618 { 619 reinterpret_cast<float*>(&dest)[offset] = v; 620 } 621 Lesscvtt::ParallelMath622 static 
Int16CompFlag Less(const SInt16 &a, const SInt16 &b) 623 { 624 Int16CompFlag result; 625 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); 626 return result; 627 } 628 Lesscvtt::ParallelMath629 static Int16CompFlag Less(const UInt15 &a, const UInt15 &b) 630 { 631 Int16CompFlag result; 632 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); 633 return result; 634 } 635 LessOrEqualcvtt::ParallelMath636 static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b) 637 { 638 Int16CompFlag result; 639 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); 640 return result; 641 } 642 Lesscvtt::ParallelMath643 static FloatCompFlag Less(const Float &a, const Float &b) 644 { 645 FloatCompFlag result; 646 for (int i = 0; i < 2; i++) 647 result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]); 648 return result; 649 } 650 LessOrEqualcvtt::ParallelMath651 static FloatCompFlag LessOrEqual(const Float &a, const Float &b) 652 { 653 FloatCompFlag result; 654 for (int i = 0; i < 2; i++) 655 result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]); 656 return result; 657 } 658 659 template<int TSubtype> Equalcvtt::ParallelMath660 static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) 661 { 662 Int16CompFlag result; 663 result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value); 664 return result; 665 } 666 Equalcvtt::ParallelMath667 static FloatCompFlag Equal(const Float &a, const Float &b) 668 { 669 FloatCompFlag result; 670 for (int i = 0; i < 2; i++) 671 result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]); 672 return result; 673 } 674 ToFloatcvtt::ParallelMath675 static Float ToFloat(const UInt16 &v) 676 { 677 Float result; 678 result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); 679 result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); 680 return result; 681 } 682 ToUInt31cvtt::ParallelMath683 static UInt31 ToUInt31(const UInt16 &v) 684 { 685 
UInt31 result; 686 result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); 687 result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); 688 return result; 689 } 690 ToInt32cvtt::ParallelMath691 static SInt32 ToInt32(const UInt16 &v) 692 { 693 SInt32 result; 694 result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); 695 result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); 696 return result; 697 } 698 ToInt32cvtt::ParallelMath699 static SInt32 ToInt32(const SInt16 &v) 700 { 701 SInt32 result; 702 result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16); 703 result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16); 704 return result; 705 } 706 ToFloatcvtt::ParallelMath707 static Float ToFloat(const SInt16 &v) 708 { 709 Float result; 710 result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16)); 711 result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16)); 712 return result; 713 } 714 ToFloatcvtt::ParallelMath715 static Float ToFloat(const UInt15 &v) 716 { 717 Float result; 718 result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); 719 result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); 720 return result; 721 } 722 ToFloatcvtt::ParallelMath723 static Float ToFloat(const UInt31 &v) 724 { 725 Float result; 726 result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]); 727 result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]); 728 return result; 729 } 730 FloatFlagToInt16cvtt::ParallelMath731 static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v) 732 { 733 __m128i lo = _mm_castps_si128(v.m_values[0]); 734 __m128i hi = _mm_castps_si128(v.m_values[1]); 735 736 Int16CompFlag result; 737 result.m_value = _mm_packs_epi32(lo, hi); 738 return result; 739 } 740 
Int16FlagToFloatcvtt::ParallelMath741 static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v) 742 { 743 __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value); 744 __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value); 745 746 FloatCompFlag result; 747 result.m_values[0] = _mm_castsi128_ps(lo); 748 result.m_values[1] = _mm_castsi128_ps(hi); 749 return result; 750 } 751 MakeBoolInt16cvtt::ParallelMath752 static Int16CompFlag MakeBoolInt16(bool b) 753 { 754 Int16CompFlag result; 755 if (b) 756 result.m_value = _mm_set1_epi16(-1); 757 else 758 result.m_value = _mm_setzero_si128(); 759 return result; 760 } 761 MakeBoolFloatcvtt::ParallelMath762 static FloatCompFlag MakeBoolFloat(bool b) 763 { 764 FloatCompFlag result; 765 if (b) 766 result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1)); 767 else 768 result.m_values[0] = result.m_values[1] = _mm_setzero_ps(); 769 return result; 770 } 771 AndNotcvtt::ParallelMath772 static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b) 773 { 774 Int16CompFlag result; 775 result.m_value = _mm_andnot_si128(b.m_value, a.m_value); 776 return result; 777 } 778 RoundAndConvertToU16cvtt::ParallelMath779 static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/) 780 { 781 __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768))); 782 __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768))); 783 784 __m128i packed = _mm_packs_epi32(lo, hi); 785 786 UInt16 result; 787 result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768)); 788 return result; 789 } 790 RoundAndConvertToU15cvtt::ParallelMath791 static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/) 792 { 793 __m128i lo = _mm_cvtps_epi32(v.m_values[0]); 794 __m128i hi = _mm_cvtps_epi32(v.m_values[1]); 795 796 __m128i packed = _mm_packs_epi32(lo, hi); 797 798 UInt15 result; 799 result.m_value = _mm_packs_epi32(lo, hi); 800 return result; 801 } 802 
RoundAndConvertToS16cvtt::ParallelMath803 static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/) 804 { 805 __m128i lo = _mm_cvtps_epi32(v.m_values[0]); 806 __m128i hi = _mm_cvtps_epi32(v.m_values[1]); 807 808 __m128i packed = _mm_packs_epi32(lo, hi); 809 810 SInt16 result; 811 result.m_value = _mm_packs_epi32(lo, hi); 812 return result; 813 } 814 Sqrtcvtt::ParallelMath815 static Float Sqrt(const Float &f) 816 { 817 Float result; 818 for (int i = 0; i < 2; i++) 819 result.m_values[i] = _mm_sqrt_ps(f.m_values[i]); 820 return result; 821 } 822 Abscvtt::ParallelMath823 static UInt16 Abs(const SInt16 &a) 824 { 825 __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15); 826 __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15); 827 828 UInt16 result; 829 result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd); 830 return result; 831 } 832 Abscvtt::ParallelMath833 static Float Abs(const Float& a) 834 { 835 __m128 invMask = _mm_set1_ps(-0.0f); 836 837 Float result; 838 result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]); 839 result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]); 840 return result; 841 } 842 SqDiffUInt8cvtt::ParallelMath843 static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b) 844 { 845 __m128i diff = _mm_sub_epi16(a.m_value, b.m_value); 846 847 UInt16 result; 848 result.m_value = _mm_mullo_epi16(diff, diff); 849 return result; 850 } 851 SqDiffSInt16cvtt::ParallelMath852 static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b) 853 { 854 __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value)); 855 856 __m128i mulHi = _mm_mulhi_epu16(diffU, diffU); 857 __m128i mulLo = _mm_mullo_epi16(diffU, diffU); 858 __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi); 859 __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi); 860 861 Float result; 862 result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo); 863 result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi); 864 865 
return result; 866 } 867 TwosCLHalfToFloatcvtt::ParallelMath868 static Float TwosCLHalfToFloat(const SInt16 &v) 869 { 870 __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15)); 871 872 __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768)); 873 __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff)); 874 __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00)); 875 876 __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128()); 877 878 // Convert exponent to high-bits 879 exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336)); 880 881 __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336))); 882 883 __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3))); 884 __m128i lowBits = _mm_slli_epi16(mantissa, 13); 885 886 __m128i flow = _mm_unpacklo_epi16(lowBits, highBits); 887 __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits); 888 889 __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh); 890 __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh); 891 892 Float result; 893 result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow)); 894 result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh)); 895 896 return result; 897 } 898 SqDiff2CLFloatcvtt::ParallelMath899 static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) 900 { 901 Float fa = TwosCLHalfToFloat(a); 902 903 Float diff = fa - b; 904 return diff * diff; 905 } 906 SqDiff2CLcvtt::ParallelMath907 static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) 908 { 909 Float fa = TwosCLHalfToFloat(a); 910 Float fb = TwosCLHalfToFloat(b); 911 912 Float diff = fa - fb; 913 return diff * diff; 914 } 915 SqDiff2CLFloatcvtt::ParallelMath916 static Float SqDiff2CLFloat(const 
SInt16 &a, float aWeight, const Float &b) 917 { 918 Float fa = TwosCLHalfToFloat(a) * aWeight; 919 920 Float diff = fa - b; 921 return diff * diff; 922 } 923 RightShiftcvtt::ParallelMath924 static UInt16 RightShift(const UInt16 &v, int bits) 925 { 926 UInt16 result; 927 result.m_value = _mm_srli_epi16(v.m_value, bits); 928 return result; 929 } 930 RightShiftcvtt::ParallelMath931 static UInt31 RightShift(const UInt31 &v, int bits) 932 { 933 UInt31 result; 934 result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits); 935 result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits); 936 return result; 937 } 938 RightShiftcvtt::ParallelMath939 static SInt16 RightShift(const SInt16 &v, int bits) 940 { 941 SInt16 result; 942 result.m_value = _mm_srai_epi16(v.m_value, bits); 943 return result; 944 } 945 RightShiftcvtt::ParallelMath946 static UInt15 RightShift(const UInt15 &v, int bits) 947 { 948 UInt15 result; 949 result.m_value = _mm_srli_epi16(v.m_value, bits); 950 return result; 951 } 952 RightShiftcvtt::ParallelMath953 static SInt32 RightShift(const SInt32 &v, int bits) 954 { 955 SInt32 result; 956 result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits); 957 result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits); 958 return result; 959 } 960 ToSInt16cvtt::ParallelMath961 static SInt16 ToSInt16(const SInt32 &v) 962 { 963 SInt16 result; 964 result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); 965 return result; 966 } 967 ToUInt16cvtt::ParallelMath968 static UInt16 ToUInt16(const UInt32 &v) 969 { 970 __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); 971 __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); 972 973 UInt16 result; 974 result.m_value = _mm_packs_epi32(low, high); 975 return result; 976 } 977 ToUInt16cvtt::ParallelMath978 static UInt16 ToUInt16(const UInt31 &v) 979 { 980 __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); 981 __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); 
982 983 UInt16 result; 984 result.m_value = _mm_packs_epi32(low, high); 985 return result; 986 } 987 ToUInt15cvtt::ParallelMath988 static UInt15 ToUInt15(const UInt31 &v) 989 { 990 UInt15 result; 991 result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); 992 return result; 993 } 994 XMultiplycvtt::ParallelMath995 static SInt32 XMultiply(const SInt16 &a, const SInt16 &b) 996 { 997 __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); 998 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); 999 1000 SInt32 result; 1001 result.m_values[0] = _mm_unpacklo_epi16(low, high); 1002 result.m_values[1] = _mm_unpackhi_epi16(low, high); 1003 return result; 1004 } 1005 XMultiplycvtt::ParallelMath1006 static SInt32 XMultiply(const SInt16 &a, const UInt15 &b) 1007 { 1008 __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); 1009 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); 1010 1011 SInt32 result; 1012 result.m_values[0] = _mm_unpacklo_epi16(low, high); 1013 result.m_values[1] = _mm_unpackhi_epi16(low, high); 1014 return result; 1015 } 1016 XMultiplycvtt::ParallelMath1017 static SInt32 XMultiply(const UInt15 &a, const SInt16 &b) 1018 { 1019 return XMultiply(b, a); 1020 } 1021 XMultiplycvtt::ParallelMath1022 static UInt32 XMultiply(const UInt16 &a, const UInt16 &b) 1023 { 1024 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); 1025 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); 1026 1027 UInt32 result; 1028 result.m_values[0] = _mm_unpacklo_epi16(low, high); 1029 result.m_values[1] = _mm_unpackhi_epi16(low, high); 1030 return result; 1031 } 1032 CompactMultiplycvtt::ParallelMath1033 static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b) 1034 { 1035 UInt16 result; 1036 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); 1037 return result; 1038 } 1039 CompactMultiplycvtt::ParallelMath1040 static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b) 1041 { 1042 UInt16 result; 1043 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); 1044 
return result; 1045 } 1046 XMultiplycvtt::ParallelMath1047 static UInt31 XMultiply(const UInt15 &a, const UInt15 &b) 1048 { 1049 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); 1050 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); 1051 1052 UInt31 result; 1053 result.m_values[0] = _mm_unpacklo_epi16(low, high); 1054 result.m_values[1] = _mm_unpackhi_epi16(low, high); 1055 return result; 1056 } 1057 XMultiplycvtt::ParallelMath1058 static UInt31 XMultiply(const UInt16 &a, const UInt15 &b) 1059 { 1060 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); 1061 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); 1062 1063 UInt31 result; 1064 result.m_values[0] = _mm_unpacklo_epi16(low, high); 1065 result.m_values[1] = _mm_unpackhi_epi16(low, high); 1066 return result; 1067 } 1068 XMultiplycvtt::ParallelMath1069 static UInt31 XMultiply(const UInt15 &a, const UInt16 &b) 1070 { 1071 return XMultiply(b, a); 1072 } 1073 AnySetcvtt::ParallelMath1074 static bool AnySet(const Int16CompFlag &v) 1075 { 1076 return _mm_movemask_epi8(v.m_value) != 0; 1077 } 1078 AllSetcvtt::ParallelMath1079 static bool AllSet(const Int16CompFlag &v) 1080 { 1081 return _mm_movemask_epi8(v.m_value) == 0xffff; 1082 } 1083 AnySetcvtt::ParallelMath1084 static bool AnySet(const FloatCompFlag &v) 1085 { 1086 return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0; 1087 } 1088 AllSetcvtt::ParallelMath1089 static bool AllSet(const FloatCompFlag &v) 1090 { 1091 return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf; 1092 } 1093 }; 1094 1095 #else 1096 // Scalar version 1097 struct ParallelMath 1098 { 1099 struct RoundTowardZeroForScope 1100 { 1101 }; 1102 1103 struct RoundTowardNearestForScope 1104 { 1105 }; 1106 1107 struct RoundUpForScope 1108 { 1109 }; 1110 1111 struct RoundDownForScope 1112 { 1113 }; 1114 1115 static const int ParallelSize = 1; 1116 1117 enum Int16Subtype 1118 { 1119 IntSubtype_Signed, 1120 IntSubtype_UnsignedFull, 1121 
IntSubtype_UnsignedTruncated, 1122 IntSubtype_Abstract, 1123 }; 1124 1125 typedef int32_t SInt16; 1126 typedef int32_t UInt15; 1127 typedef int32_t UInt16; 1128 typedef int32_t AInt16; 1129 1130 typedef int32_t SInt32; 1131 typedef int32_t UInt31; 1132 typedef int32_t UInt32; 1133 typedef int32_t AInt32; 1134 1135 typedef int32_t ScalarUInt16; 1136 typedef int32_t ScalarSInt16; 1137 1138 typedef float Float; 1139 1140 template<class TTargetType> 1141 struct LosslessCast 1142 { 1143 static const int32_t& Cast(const int32_t &src) 1144 { 1145 return src; 1146 } 1147 }; 1148 1149 typedef bool Int16CompFlag; 1150 typedef bool FloatCompFlag; 1151 1152 static int32_t AbstractAdd(const int32_t &a, const int32_t &b) 1153 { 1154 return a + b; 1155 } 1156 1157 static int32_t AbstractSubtract(const int32_t &a, const int32_t &b) 1158 { 1159 return a - b; 1160 } 1161 1162 static float Select(bool flag, float a, float b) 1163 { 1164 return flag ? a : b; 1165 } 1166 1167 static int32_t Select(bool flag, int32_t a, int32_t b) 1168 { 1169 return flag ? a : b; 1170 } 1171 1172 static int32_t SelectOrZero(bool flag, int32_t a) 1173 { 1174 return flag ? a : 0; 1175 } 1176 1177 static void ConditionalSet(int32_t& dest, bool flag, int32_t src) 1178 { 1179 if (flag) 1180 dest = src; 1181 } 1182 1183 static int32_t ConditionalNegate(bool flag, int32_t v) 1184 { 1185 return (flag) ? 
-v : v; 1186 } 1187 1188 static void NotConditionalSet(int32_t& dest, bool flag, int32_t src) 1189 { 1190 if (!flag) 1191 dest = src; 1192 } 1193 1194 static void ConditionalSet(float& dest, bool flag, float src) 1195 { 1196 if (flag) 1197 dest = src; 1198 } 1199 1200 static void NotConditionalSet(float& dest, bool flag, float src) 1201 { 1202 if (!flag) 1203 dest = src; 1204 } 1205 1206 static void MakeSafeDenominator(float& v) 1207 { 1208 if (v == 0.0f) 1209 v = 1.0f; 1210 } 1211 1212 static int32_t SignedRightShift(int32_t v, int bits) 1213 { 1214 return v >> bits; 1215 } 1216 1217 static int32_t TruncateToPrecisionSigned(int32_t v, int precision) 1218 { 1219 v = (v << (32 - precision)) & 0xffffffff; 1220 return SignedRightShift(v, 32 - precision); 1221 } 1222 1223 static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision) 1224 { 1225 return v & ((1 << precision) - 1); 1226 } 1227 1228 static int32_t Min(int32_t a, int32_t b) 1229 { 1230 if (a < b) 1231 return a; 1232 return b; 1233 } 1234 1235 static float Min(float a, float b) 1236 { 1237 if (a < b) 1238 return a; 1239 return b; 1240 } 1241 1242 static int32_t Max(int32_t a, int32_t b) 1243 { 1244 if (a > b) 1245 return a; 1246 return b; 1247 } 1248 1249 static float Max(float a, float b) 1250 { 1251 if (a > b) 1252 return a; 1253 return b; 1254 } 1255 1256 static float Abs(float a) 1257 { 1258 return fabsf(a); 1259 } 1260 1261 static int32_t Abs(int32_t a) 1262 { 1263 if (a < 0) 1264 return -a; 1265 return a; 1266 } 1267 1268 static float Clamp(float v, float min, float max) 1269 { 1270 if (v < min) 1271 return min; 1272 if (v > max) 1273 return max; 1274 return v; 1275 } 1276 1277 static float Reciprocal(float v) 1278 { 1279 return 1.0f / v; 1280 } 1281 1282 static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut) 1283 { 1284 chOut = inputBlocks[0].m_pixels[pxOffset][channel]; 1285 } 1286 1287 static void ConvertHDRInputs(const PixelBlockF16* 
inputBlocks, int pxOffset, int channel, int32_t& chOut) 1288 { 1289 chOut = inputBlocks[0].m_pixels[pxOffset][channel]; 1290 } 1291 1292 static float MakeFloat(float v) 1293 { 1294 return v; 1295 } 1296 1297 static float MakeFloatZero() 1298 { 1299 return 0.0f; 1300 } 1301 1302 static int32_t MakeUInt16(uint16_t v) 1303 { 1304 return v; 1305 } 1306 1307 static int32_t MakeSInt16(int16_t v) 1308 { 1309 return v; 1310 } 1311 1312 static int32_t MakeAInt16(int16_t v) 1313 { 1314 return v; 1315 } 1316 1317 static int32_t MakeUInt15(uint16_t v) 1318 { 1319 return v; 1320 } 1321 1322 static int32_t MakeSInt32(int32_t v) 1323 { 1324 return v; 1325 } 1326 1327 static int32_t MakeUInt31(int32_t v) 1328 { 1329 return v; 1330 } 1331 1332 static int32_t Extract(int32_t v, int offset) 1333 { 1334 UNREFERENCED_PARAMETER(offset); 1335 return v; 1336 } 1337 1338 static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) 1339 { 1340 UNREFERENCED_PARAMETER(offset); 1341 dest = v; 1342 } 1343 1344 static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) 1345 { 1346 UNREFERENCED_PARAMETER(offset); 1347 dest = v; 1348 } 1349 1350 static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v) 1351 { 1352 UNREFERENCED_PARAMETER(offset); 1353 dest = v; 1354 } 1355 1356 static float ExtractFloat(float v, int offset) 1357 { 1358 UNREFERENCED_PARAMETER(offset); 1359 return v; 1360 } 1361 1362 static void PutFloat(float &dest, int offset, float v) 1363 { 1364 UNREFERENCED_PARAMETER(offset); 1365 dest = v; 1366 } 1367 1368 static bool Less(int32_t a, int32_t b) 1369 { 1370 return a < b; 1371 } 1372 1373 static bool Less(float a, float b) 1374 { 1375 return a < b; 1376 } 1377 1378 static bool LessOrEqual(int32_t a, int32_t b) 1379 { 1380 return a < b; 1381 } 1382 1383 static bool LessOrEqual(float a, float b) 1384 { 1385 return a < b; 1386 } 1387 1388 static bool Equal(int32_t a, int32_t b) 1389 { 1390 return a == b; 1391 } 1392 1393 
static bool Equal(float a, float b) 1394 { 1395 return a == b; 1396 } 1397 1398 static float ToFloat(int32_t v) 1399 { 1400 return static_cast<float>(v); 1401 } 1402 1403 static int32_t ToUInt31(int32_t v) 1404 { 1405 return v; 1406 } 1407 1408 static int32_t ToInt32(int32_t v) 1409 { 1410 return v; 1411 } 1412 1413 static bool FloatFlagToInt16(bool v) 1414 { 1415 return v; 1416 } 1417 1418 static bool Int16FlagToFloat(bool v) 1419 { 1420 return v; 1421 } 1422 1423 static bool MakeBoolInt16(bool b) 1424 { 1425 return b; 1426 } 1427 1428 static bool MakeBoolFloat(bool b) 1429 { 1430 return b; 1431 } 1432 1433 static bool AndNot(bool a, bool b) 1434 { 1435 return a && !b; 1436 } 1437 1438 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz) 1439 { 1440 UNREFERENCED_PARAMETER(rtz); 1441 return static_cast<int>(v); 1442 } 1443 1444 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru) 1445 { 1446 UNREFERENCED_PARAMETER(ru); 1447 return static_cast<int>(ceilf(v)); 1448 } 1449 1450 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd) 1451 { 1452 UNREFERENCED_PARAMETER(rd); 1453 return static_cast<int>(floorf(v)); 1454 } 1455 1456 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn) 1457 { 1458 UNREFERENCED_PARAMETER(rtn); 1459 return static_cast<int>(floorf(v + 0.5f)); 1460 } 1461 1462 template<class TRoundMode> 1463 static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode) 1464 { 1465 return RoundAndConvertToInt(v, roundingMode); 1466 } 1467 1468 template<class TRoundMode> 1469 static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode) 1470 { 1471 return RoundAndConvertToInt(v, roundingMode); 1472 } 1473 1474 template<class TRoundMode> 1475 static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode) 1476 { 1477 return RoundAndConvertToInt(v, roundingMode); 1478 } 
1479 1480 static float Sqrt(float f) 1481 { 1482 return sqrtf(f); 1483 } 1484 1485 static int32_t SqDiffUInt8(int32_t a, int32_t b) 1486 { 1487 int32_t delta = a - b; 1488 return delta * delta; 1489 } 1490 1491 static int32_t SqDiffInt16(int32_t a, int32_t b) 1492 { 1493 int32_t delta = a - b; 1494 return delta * delta; 1495 } 1496 1497 static int32_t SqDiffSInt16(int32_t a, int32_t b) 1498 { 1499 int32_t delta = a - b; 1500 return delta * delta; 1501 } 1502 1503 static float TwosCLHalfToFloat(int32_t v) 1504 { 1505 int32_t absV = (v < 0) ? -v : v; 1506 1507 int32_t signBits = (absV & -32768); 1508 int32_t mantissa = (absV & 0x03ff); 1509 int32_t exponent = (absV & 0x7c00); 1510 1511 bool isDenormal = (exponent == 0); 1512 1513 // Convert exponent to high-bits 1514 exponent = (exponent >> 3) + 14336; 1515 1516 int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16; 1517 1518 int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13); 1519 1520 float f, correction; 1521 memcpy(&f, &fBits, 4); 1522 memcpy(&correction, &denormalCorrection, 4); 1523 1524 return f - correction; 1525 } 1526 1527 static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) 1528 { 1529 Float fa = TwosCLHalfToFloat(a); 1530 1531 Float diff = fa - b; 1532 return diff * diff; 1533 } 1534 1535 static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) 1536 { 1537 Float fa = TwosCLHalfToFloat(a); 1538 Float fb = TwosCLHalfToFloat(b); 1539 1540 Float diff = fa - fb; 1541 return diff * diff; 1542 } 1543 1544 static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) 1545 { 1546 Float fa = TwosCLHalfToFloat(a) * aWeight; 1547 1548 Float diff = fa - b; 1549 return diff * diff; 1550 } 1551 1552 static int32_t RightShift(int32_t v, int bits) 1553 { 1554 return SignedRightShift(v, bits); 1555 } 1556 1557 static int32_t ToSInt16(int32_t v) 1558 { 1559 return v; 1560 } 1561 1562 static int32_t ToUInt16(int32_t v) 1563 { 1564 return v; 1565 } 1566 1567 static 
int32_t ToUInt15(int32_t v) 1568 { 1569 return v; 1570 } 1571 1572 static int32_t XMultiply(int32_t a, int32_t b) 1573 { 1574 return a * b; 1575 } 1576 1577 static int32_t CompactMultiply(int32_t a, int32_t b) 1578 { 1579 return a * b; 1580 } 1581 1582 static bool AnySet(bool v) 1583 { 1584 return v; 1585 } 1586 1587 static bool AllSet(bool v) 1588 { 1589 return v; 1590 } 1591 }; 1592 1593 #endif 1594 1595 namespace Internal 1596 { 1597 namespace BC7Data 1598 { 1599 enum AlphaMode 1600 { 1601 AlphaMode_Combined, 1602 AlphaMode_Separate, 1603 AlphaMode_None, 1604 }; 1605 1606 enum PBitMode 1607 { 1608 PBitMode_PerEndpoint, 1609 PBitMode_PerSubset, 1610 PBitMode_None 1611 }; 1612 1613 struct BC7ModeInfo 1614 { 1615 PBitMode m_pBitMode; 1616 AlphaMode m_alphaMode; 1617 int m_rgbBits; 1618 int m_alphaBits; 1619 int m_partitionBits; 1620 int m_numSubsets; 1621 int m_indexBits; 1622 int m_alphaIndexBits; 1623 bool m_hasIndexSelector; 1624 }; 1625 1626 BC7ModeInfo g_modes[] = 1627 { 1628 { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false }, // 0 1629 { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false }, // 1 1630 { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false }, // 2 1631 { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false }, // 3 (Mode reference has an error, P-bit is really per-endpoint) 1632 1633 { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true }, // 4 1634 { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false }, // 5 1635 { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6 1636 { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false } // 7 1637 }; 1638 1639 const int g_weight2[] = { 0, 21, 43, 64 }; 1640 const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; 1641 const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; 1642 1643 const int *g_weightTables[] = 1644 { 1645 NULL, 1646 NULL, 1647 g_weight2, 1648 g_weight3, 1649 g_weight4 
1650 }; 1651 1652 struct BC6HModeInfo 1653 { 1654 uint16_t m_modeID; 1655 bool m_partitioned; 1656 bool m_transformed; 1657 int m_aPrec; 1658 int m_bPrec[3]; 1659 }; 1660 1661 // [partitioned][precision] 1662 bool g_hdrModesExistForPrecision[2][17] = 1663 { 1664 //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1665 { false, false, false, false, false, false, false, false, false, false, true, true, true, false, false, false, true }, 1666 { false, false, false, false, false, false, true, true, true, true, true, true, false, false, false, false, false }, 1667 }; 1668 1669 BC6HModeInfo g_hdrModes[] = 1670 { 1671 { 0x00, true, true, 10,{ 5, 5, 5 } }, 1672 { 0x01, true, true, 7,{ 6, 6, 6 } }, 1673 { 0x02, true, true, 11,{ 5, 4, 4 } }, 1674 { 0x06, true, true, 11,{ 4, 5, 4 } }, 1675 { 0x0a, true, true, 11,{ 4, 4, 5 } }, 1676 { 0x0e, true, true, 9,{ 5, 5, 5 } }, 1677 { 0x12, true, true, 8,{ 6, 5, 5 } }, 1678 { 0x16, true, true, 8,{ 5, 6, 5 } }, 1679 { 0x1a, true, true, 8,{ 5, 5, 6 } }, 1680 { 0x1e, true, false, 6,{ 6, 6, 6 } }, 1681 { 0x03, false, false, 10,{ 10, 10, 10 } }, 1682 { 0x07, false, true, 11,{ 9, 9, 9 } }, 1683 { 0x0b, false, true, 12,{ 8, 8, 8 } }, 1684 { 0x0f, false, true, 16,{ 4, 4, 4 } }, 1685 }; 1686 1687 const int g_maxHDRPrecision = 16; 1688 1689 static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]); 1690 1691 static uint16_t g_partitionMap[64] = 1692 { 1693 0xCCCC, 0x8888, 0xEEEE, 0xECC8, 1694 0xC880, 0xFEEC, 0xFEC8, 0xEC80, 1695 0xC800, 0xFFEC, 0xFE80, 0xE800, 1696 0xFFE8, 0xFF00, 0xFFF0, 0xF000, 1697 0xF710, 0x008E, 0x7100, 0x08CE, 1698 0x008C, 0x7310, 0x3100, 0x8CCE, 1699 0x088C, 0x3110, 0x6666, 0x366C, 1700 0x17E8, 0x0FF0, 0x718E, 0x399C, 1701 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 1702 0x3c3c, 0x55aa, 0x9696, 0xa55a, 1703 0x73ce, 0x13c8, 0x324c, 0x3bdc, 1704 0x6996, 0xc33c, 0x9966, 0x660, 1705 0x272, 0x4e4, 0x4e40, 0x2720, 1706 0xc936, 0x936c, 0x39c6, 0x639c, 1707 0x9336, 0x9cc6, 0x817e, 0xe718, 1708 0xccf0, 0xfcc, 0x7744, 0xee22, 1709 
}; 1710 1711 static uint32_t g_partitionMap2[64] = 1712 { 1713 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, 1714 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, 1715 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090, 1716 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, 1717 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, 1718 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500, 1719 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, 1720 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, 1721 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424, 1722 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50, 1723 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0, 1724 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, 1725 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, 1726 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000, 1727 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, 1728 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254, 1729 }; 1730 1731 static int g_fixupIndexes2[64] = 1732 { 1733 15,15,15,15, 1734 15,15,15,15, 1735 15,15,15,15, 1736 15,15,15,15, 1737 15, 2, 8, 2, 1738 2, 8, 8,15, 1739 2, 8, 2, 2, 1740 8, 8, 2, 2, 1741 1742 15,15, 6, 8, 1743 2, 8,15,15, 1744 2, 8, 2, 2, 1745 2,15,15, 6, 1746 6, 2, 6, 8, 1747 15,15, 2, 2, 1748 15,15,15,15, 1749 15, 2, 2,15, 1750 }; 1751 1752 static int g_fixupIndexes3[64][2] = 1753 { 1754 { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 }, 1755 { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 }, 1756 { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 }, 1757 { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 }, 1758 { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 }, 1759 { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 }, 1760 { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 }, 1761 { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 }, 1762 1763 { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 }, 1764 { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 }, 1765 { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 }, 1766 { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 }, 1767 { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 }, 1768 { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 }, 1769 { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 }, 1770 { 15, 3 },{ 12,15 },{ 3,15 
},{ 3, 8 }, 1771 }; 1772 1773 static const unsigned char g_fragments[] = 1774 { 1775 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 16 1776 0, 1, 2, 3, // 16, 4 1777 0, 1, 4, // 20, 3 1778 0, 1, 2, 4, // 23, 4 1779 2, 3, 7, // 27, 3 1780 1, 2, 3, 7, // 30, 4 1781 0, 1, 2, 3, 4, 5, 6, 7, // 34, 8 1782 0, 1, 4, 8, // 42, 4 1783 0, 1, 2, 4, 5, 8, // 46, 6 1784 0, 1, 2, 3, 4, 5, 6, 8, // 52, 8 1785 1, 4, 5, 6, 9, // 60, 5 1786 2, 5, 6, 7, 10, // 65, 5 1787 5, 6, 9, 10, // 70, 4 1788 2, 3, 7, 11, // 74, 4 1789 1, 2, 3, 6, 7, 11, // 78, 6 1790 0, 1, 2, 3, 5, 6, 7, 11, // 84, 8 1791 0, 1, 2, 3, 8, 9, 10, 11, // 92, 8 1792 2, 3, 6, 7, 8, 9, 10, 11, // 100, 8 1793 4, 5, 6, 7, 8, 9, 10, 11, // 108, 8 1794 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 116, 12 1795 0, 4, 8, 12, // 128, 4 1796 0, 2, 3, 4, 6, 7, 8, 12, // 132, 8 1797 0, 1, 2, 4, 5, 8, 9, 12, // 140, 8 1798 0, 1, 2, 3, 4, 5, 6, 8, 9, 12, // 148, 10 1799 3, 6, 7, 8, 9, 12, // 158, 6 1800 3, 5, 6, 7, 8, 9, 10, 12, // 164, 8 1801 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, // 172, 12 1802 0, 1, 2, 5, 6, 7, 11, 12, // 184, 8 1803 5, 8, 9, 10, 13, // 192, 5 1804 8, 12, 13, // 197, 3 1805 4, 8, 12, 13, // 200, 4 1806 2, 3, 6, 9, 12, 13, // 204, 6 1807 0, 1, 2, 3, 8, 9, 12, 13, // 210, 8 1808 0, 1, 4, 5, 8, 9, 12, 13, // 218, 8 1809 2, 3, 6, 7, 8, 9, 12, 13, // 226, 8 1810 2, 3, 5, 6, 9, 10, 12, 13, // 234, 8 1811 0, 3, 6, 7, 9, 10, 12, 13, // 242, 8 1812 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, // 250, 12 1813 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, // 262, 13 1814 2, 3, 4, 7, 8, 11, 12, 13, // 275, 8 1815 1, 2, 6, 7, 8, 11, 12, 13, // 283, 8 1816 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, // 291, 10 1817 2, 3, 4, 5, 10, 11, 12, 13, // 301, 8 1818 0, 1, 6, 7, 10, 11, 12, 13, // 309, 8 1819 6, 9, 10, 11, 14, // 317, 5 1820 0, 2, 4, 6, 8, 10, 12, 14, // 322, 8 1821 1, 3, 5, 7, 8, 10, 12, 14, // 330, 8 1822 1, 3, 4, 6, 9, 11, 12, 14, // 338, 8 1823 0, 2, 5, 7, 9, 11, 12, 14, // 346, 8 1824 0, 3, 4, 5, 8, 9, 13, 14, // 354, 8 1825 
2, 3, 4, 7, 8, 9, 13, 14, // 362, 8 1826 1, 2, 5, 6, 9, 10, 13, 14, // 370, 8 1827 0, 3, 4, 7, 9, 10, 13, 14, // 378, 8 1828 0, 3, 5, 6, 8, 11, 13, 14, // 386, 8 1829 1, 2, 4, 7, 8, 11, 13, 14, // 394, 8 1830 0, 1, 4, 7, 10, 11, 13, 14, // 402, 8 1831 0, 3, 6, 7, 10, 11, 13, 14, // 410, 8 1832 8, 12, 13, 14, // 418, 4 1833 1, 2, 3, 7, 8, 12, 13, 14, // 422, 8 1834 4, 8, 9, 12, 13, 14, // 430, 6 1835 0, 4, 5, 8, 9, 12, 13, 14, // 436, 8 1836 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, // 444, 10 1837 2, 6, 8, 9, 10, 12, 13, 14, // 454, 8 1838 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, // 462, 12 1839 0, 7, 9, 10, 11, 12, 13, 14, // 474, 8 1840 1, 2, 3, 4, 5, 6, 8, 15, // 482, 8 1841 3, 7, 11, 15, // 490, 4 1842 0, 1, 3, 4, 5, 7, 11, 15, // 494, 8 1843 0, 4, 5, 10, 11, 15, // 502, 6 1844 1, 2, 3, 6, 7, 10, 11, 15, // 508, 8 1845 0, 1, 2, 3, 5, 6, 7, 10, 11, 15, // 516, 10 1846 0, 4, 5, 6, 9, 10, 11, 15, // 526, 8 1847 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15, // 534, 12 1848 1, 2, 4, 5, 8, 9, 12, 15, // 546, 8 1849 2, 3, 5, 6, 8, 9, 12, 15, // 554, 8 1850 0, 3, 5, 6, 9, 10, 12, 15, // 562, 8 1851 1, 2, 4, 7, 9, 10, 12, 15, // 570, 8 1852 1, 2, 5, 6, 8, 11, 12, 15, // 578, 8 1853 0, 3, 4, 7, 8, 11, 12, 15, // 586, 8 1854 0, 1, 5, 6, 10, 11, 12, 15, // 594, 8 1855 1, 2, 6, 7, 10, 11, 12, 15, // 602, 8 1856 1, 3, 4, 6, 8, 10, 13, 15, // 610, 8 1857 0, 2, 5, 7, 8, 10, 13, 15, // 618, 8 1858 0, 2, 4, 6, 9, 11, 13, 15, // 626, 8 1859 1, 3, 5, 7, 9, 11, 13, 15, // 634, 8 1860 0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15, // 642, 11 1861 2, 3, 4, 5, 8, 9, 14, 15, // 653, 8 1862 0, 1, 6, 7, 8, 9, 14, 15, // 661, 8 1863 0, 1, 5, 10, 14, 15, // 669, 6 1864 0, 3, 4, 5, 9, 10, 14, 15, // 675, 8 1865 0, 1, 5, 6, 9, 10, 14, 15, // 683, 8 1866 11, 14, 15, // 691, 3 1867 7, 11, 14, 15, // 694, 4 1868 1, 2, 4, 5, 8, 11, 14, 15, // 698, 8 1869 0, 1, 4, 7, 8, 11, 14, 15, // 706, 8 1870 0, 1, 4, 5, 10, 11, 14, 15, // 714, 8 1871 2, 3, 6, 7, 10, 11, 14, 15, // 722, 8 1872 4, 5, 6, 7, 10, 11, 14, 15, // 730, 8 1873 
0, 1, 4, 5, 7, 8, 10, 11, 14, 15, // 738, 10 1874 0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15, // 748, 12 1875 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, // 760, 13 1876 0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15, // 773, 11 1877 3, 4, 8, 9, 10, 13, 14, 15, // 784, 8 1878 11, 13, 14, 15, // 792, 4 1879 0, 1, 2, 4, 11, 13, 14, 15, // 796, 8 1880 0, 1, 2, 4, 5, 10, 11, 13, 14, 15, // 804, 10 1881 7, 10, 11, 13, 14, 15, // 814, 6 1882 3, 6, 7, 10, 11, 13, 14, 15, // 820, 8 1883 1, 5, 9, 10, 11, 13, 14, 15, // 828, 8 1884 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, // 836, 12 1885 12, 13, 14, 15, // 848, 4 1886 0, 1, 2, 3, 12, 13, 14, 15, // 852, 8 1887 0, 1, 4, 5, 12, 13, 14, 15, // 860, 8 1888 4, 5, 6, 7, 12, 13, 14, 15, // 868, 8 1889 4, 8, 9, 10, 12, 13, 14, 15, // 876, 8 1890 0, 4, 5, 8, 9, 10, 12, 13, 14, 15, // 884, 10 1891 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, // 894, 12 1892 0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15, // 906, 12 1893 0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15, // 918, 11 1894 0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15, // 929, 11 1895 7, 9, 10, 11, 12, 13, 14, 15, // 940, 8 1896 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 948, 10 1897 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, // 958, 12 1898 8, 9, 10, 11, 12, 13, 14, 15, // 970, 8 1899 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 978, 12 1900 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, // 990, 13 1901 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1003, 12 1902 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1015, 13 1903 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 1028, 12 1904 0, 2, // 1040, 2 1905 1, 3, // 1042, 2 1906 0, 1, 4, 5, // 1044, 4 1907 0, 1, 2, 4, 5, // 1048, 5 1908 2, 3, 6, // 1053, 3 1909 0, 2, 4, 6, // 1056, 4 1910 1, 2, 5, 6, // 1060, 4 1911 0, 1, 2, 3, 5, 6, // 1064, 6 1912 0, 1, 2, 4, 5, 6, // 1070, 6 1913 0, 1, 2, 3, 4, 5, 6, // 1076, 7 1914 0, 3, 4, 7, // 1083, 4 1915 0, 1, 2, 3, 4, 7, // 1087, 6 1916 1, 3, 5, 7, // 1093, 4 1917 2, 3, 6, 7, // 1097, 4 1918 1, 2, 3, 6, 7, // 1101, 5 1919 1, 2, 3, 5, 6, 
7, // 1106, 6 1920 0, 1, 2, 3, 5, 6, 7, // 1112, 7 1921 4, 5, 6, 7, // 1119, 4 1922 0, 8, // 1123, 2 1923 0, 1, 4, 5, 8, // 1125, 5 1924 0, 1, 8, 9, // 1130, 4 1925 4, 5, 8, 9, // 1134, 4 1926 0, 1, 4, 5, 8, 9, // 1138, 6 1927 2, 6, 8, 9, // 1144, 4 1928 6, 7, 8, 9, // 1148, 4 1929 0, 2, 4, 6, 8, 10, // 1152, 6 1930 1, 2, 5, 6, 9, 10, // 1158, 6 1931 0, 3, 4, 7, 9, 10, // 1164, 6 1932 0, 1, 2, 8, 9, 10, // 1170, 6 1933 4, 5, 6, 8, 9, 10, // 1176, 6 1934 3, 11, // 1182, 2 1935 2, 3, 6, 7, 11, // 1184, 5 1936 0, 3, 8, 11, // 1189, 4 1937 0, 3, 4, 7, 8, 11, // 1193, 6 1938 1, 3, 5, 7, 9, 11, // 1199, 6 1939 2, 3, 10, 11, // 1205, 4 1940 1, 5, 10, 11, // 1209, 4 1941 4, 5, 10, 11, // 1213, 4 1942 6, 7, 10, 11, // 1217, 4 1943 2, 3, 6, 7, 10, 11, // 1221, 6 1944 1, 2, 3, 9, 10, 11, // 1227, 6 1945 5, 6, 7, 9, 10, 11, // 1233, 6 1946 8, 9, 10, 11, // 1239, 4 1947 4, 12, // 1243, 2 1948 0, 1, 2, 3, 4, 5, 8, 12, // 1245, 8 1949 8, 9, 12, // 1253, 3 1950 0, 4, 5, 8, 9, 12, // 1256, 6 1951 0, 1, 4, 5, 8, 9, 12, // 1262, 7 1952 2, 3, 5, 6, 8, 9, 12, // 1269, 7 1953 1, 5, 9, 13, // 1276, 4 1954 6, 7, 9, 13, // 1280, 4 1955 1, 4, 7, 10, 13, // 1284, 5 1956 1, 6, 8, 11, 13, // 1289, 5 1957 0, 1, 12, 13, // 1294, 4 1958 4, 5, 12, 13, // 1298, 4 1959 0, 1, 6, 7, 12, 13, // 1302, 6 1960 0, 1, 4, 8, 12, 13, // 1308, 6 1961 8, 9, 12, 13, // 1314, 4 1962 4, 8, 9, 12, 13, // 1318, 5 1963 4, 5, 8, 9, 12, 13, // 1323, 6 1964 0, 4, 5, 8, 9, 12, 13, // 1329, 7 1965 0, 1, 6, 10, 12, 13, // 1336, 6 1966 3, 6, 7, 9, 10, 12, 13, // 1342, 7 1967 0, 1, 10, 11, 12, 13, // 1349, 6 1968 2, 4, 7, 9, 14, // 1355, 5 1969 4, 5, 10, 14, // 1360, 4 1970 2, 6, 10, 14, // 1364, 4 1971 2, 5, 8, 11, 14, // 1368, 5 1972 0, 2, 12, 14, // 1373, 4 1973 8, 10, 12, 14, // 1377, 4 1974 4, 6, 8, 10, 12, 14, // 1381, 6 1975 13, 14, // 1387, 2 1976 9, 10, 13, 14, // 1389, 4 1977 5, 6, 9, 10, 13, 14, // 1393, 6 1978 0, 1, 2, 12, 13, 14, // 1399, 6 1979 4, 5, 6, 12, 13, 14, // 1405, 6 1980 8, 9, 12, 13, 14, // 1411, 5 
1981 8, 9, 10, 12, 13, 14, // 1416, 6 1982 7, 15, // 1422, 2 1983 0, 5, 10, 15, // 1424, 4 1984 0, 1, 2, 3, 6, 7, 11, 15, // 1428, 8 1985 10, 11, 15, // 1436, 3 1986 0, 1, 5, 6, 10, 11, 15, // 1439, 7 1987 3, 6, 7, 10, 11, 15, // 1446, 6 1988 12, 15, // 1452, 2 1989 0, 3, 12, 15, // 1454, 4 1990 4, 7, 12, 15, // 1458, 4 1991 0, 3, 6, 9, 12, 15, // 1462, 6 1992 0, 3, 5, 10, 12, 15, // 1468, 6 1993 8, 11, 12, 15, // 1474, 4 1994 5, 6, 8, 11, 12, 15, // 1478, 6 1995 4, 7, 8, 11, 12, 15, // 1484, 6 1996 1, 3, 13, 15, // 1490, 4 1997 9, 11, 13, 15, // 1494, 4 1998 5, 7, 9, 11, 13, 15, // 1498, 6 1999 2, 3, 14, 15, // 1504, 4 2000 2, 3, 4, 5, 14, 15, // 1508, 6 2001 6, 7, 14, 15, // 1514, 4 2002 2, 3, 5, 9, 14, 15, // 1518, 6 2003 2, 3, 8, 9, 14, 15, // 1524, 6 2004 10, 14, 15, // 1530, 3 2005 0, 4, 5, 9, 10, 14, 15, // 1533, 7 2006 2, 3, 7, 11, 14, 15, // 1540, 6 2007 10, 11, 14, 15, // 1546, 4 2008 7, 10, 11, 14, 15, // 1550, 5 2009 6, 7, 10, 11, 14, 15, // 1555, 6 2010 1, 2, 3, 13, 14, 15, // 1561, 6 2011 5, 6, 7, 13, 14, 15, // 1567, 6 2012 10, 11, 13, 14, 15, // 1573, 5 2013 9, 10, 11, 13, 14, 15, // 1578, 6 2014 0, 4, 8, 9, 12, 13, 14, 15, // 1584, 8 2015 9, 10, 12, 13, 14, 15, // 1592, 6 2016 8, 11, 12, 13, 14, 15, // 1598, 6 2017 3, 7, 10, 11, 12, 13, 14, 15, // 1604, 8 2018 }; 2019 static const int g_shapeRanges[][2] = 2020 { 2021 { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 }, 2022 { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 }, 2023 { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 }, 2024 { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 }, 2025 { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 }, 2026 { 402, 8 },{ 410, 8 
},{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 }, 2027 { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 }, 2028 { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 }, 2029 { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 }, 2030 { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 }, 2031 { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 }, 2032 { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 }, 2033 { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 }, 2034 { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 }, 2035 { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 }, 2036 { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 }, 2037 { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 }, 2038 { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 }, 2039 { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 }, 2040 { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 }, 
2041 { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 }, 2042 { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 }, 2043 { 1604, 8 }, 2044 }; 2045 static const int g_shapes1[][2] = 2046 { 2047 { 0, 16 } 2048 }; 2049 static const int g_shapes2[64][2] = 2050 { 2051 { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 }, 2052 { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 }, 2053 { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 }, 2054 { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 }, 2055 { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 }, 2056 { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 }, 2057 { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 }, 2058 { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 }, 2059 }; 2060 static const int g_shapes3[64][3] = 2061 { 2062 { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 }, 2063 { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 }, 2064 { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 }, 2065 { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 }, 2066 { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 }, 2067 { 192, 225, 153 },{ 134, 141, 123 
},{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 }, 2068 { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 }, 2069 { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 }, 2070 }; 2071 2072 static const int g_shapeList1[] = 2073 { 2074 0, 2075 }; 2076 2077 static const int g_shapeList1Collapse[] = 2078 { 2079 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2080 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2081 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2082 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2083 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2084 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2085 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2086 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2087 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2088 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2089 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2090 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2091 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2092 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2093 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2094 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2095 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2096 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2097 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2098 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2099 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2100 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2101 -1, 2102 }; 2103 static const int g_shapeList2[] = 2104 { 2105 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2106 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2107 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 2108 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 2109 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 2110 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 2111 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 
2112 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 2113 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 2114 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 2115 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 2116 122, 123, 124, 125, 126, 127, 128, 2117 }; 2118 static const int g_shapeList2Collapse[] = 2119 { 2120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2121 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2122 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 2123 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 2124 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 2125 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 2126 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 2127 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 2128 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 2129 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 2130 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 2131 120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1, 2132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2134 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2135 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2136 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2137 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2139 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2140 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2141 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2142 -1, 2143 }; 2144 2145 static const int g_shapeList12[] = 2146 { 2147 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2148 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2149 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 2150 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 2151 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 2152 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 2153 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 2154 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 2155 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 2156 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 2157 110, 111, 112, 113, 
114, 115, 116, 117, 118, 119, 120, 2158 121, 122, 123, 124, 125, 126, 127, 128, 2159 }; 2160 2161 static const int g_shapeList12Collapse[] = 2162 { 2163 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2164 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2165 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 2166 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 2167 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 2168 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 2169 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 2170 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 2171 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 2172 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 2173 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 2174 121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1, 2175 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2176 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2177 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2178 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2179 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2180 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2181 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2182 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2183 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2184 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2185 -1, 2186 }; 2187 2188 static const int g_shapeList3[] = 2189 { 2190 1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29, 2191 33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109, 2192 110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135, 2193 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 2194 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 2195 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 2196 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 2197 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 2198 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 2199 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 2200 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 2201 224, 225, 226, 227, 228, 229, 230, 231, 232, 
233, 234, 2202 235, 236, 237, 238, 239, 240, 241, 242, 2203 }; 2204 2205 static const int g_shapeList3Collapse[] = 2206 { 2207 -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1, 2208 -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1, 2209 -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1, 2210 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2211 -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13, 2212 -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, 2213 16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2214 -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2215 -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1, 2216 -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21, 2217 22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2218 -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28, 2219 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 2220 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 2221 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 2222 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 2223 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 2224 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 2225 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 2226 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 2227 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 2228 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 2229 139, 2230 }; 2231 2232 static const int g_shapeList3Short[] = 2233 { 2234 1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96, 2235 106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160, 2236 171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232, 2237 233, 237, 240, 2238 }; 2239 2240 static const int g_shapeList3ShortCollapse[] = 2241 { 2242 -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1, 2243 -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1, 2244 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2245 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2246 -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1, 2247 -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1, 2248 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2249 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2250 -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, 2251 -1, 
-1, -1, -1, -1, -1, -1, 11, -1, -1, -1, 2252 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2253 -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14, 2254 15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17, 2255 18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1, 2256 -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1, 2257 -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23, 2258 -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26, 2259 27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1, 2260 -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1, 2261 -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1, 2262 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2263 -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1, 2264 -1, 2265 }; 2266 2267 static const int g_shapeListAll[] = 2268 { 2269 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2270 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2271 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 2272 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 2273 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 2274 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 2275 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 2276 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 2277 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 2278 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 2279 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 2280 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 2281 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 2282 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 2283 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 2284 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 2285 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 2286 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 2287 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 2288 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 2289 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 2290 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 2291 242, 2292 }; 2293 2294 static const int g_numShapes1 = sizeof(g_shapeList1) / 
sizeof(g_shapeList1[0]); 2295 static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]); 2296 static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]); 2297 static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]); 2298 static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]); 2299 static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]); 2300 static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]); 2301 2302 static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3; 2303 } 2304 2305 namespace BC6HData 2306 { 2307 enum EField 2308 { 2309 NA, // N/A 2310 M, // Mode 2311 D, // Shape 2312 RW, 2313 RX, 2314 RY, 2315 RZ, 2316 GW, 2317 GX, 2318 GY, 2319 GZ, 2320 BW, 2321 BX, 2322 BY, 2323 BZ, 2324 }; 2325 2326 struct ModeDescriptor 2327 { 2328 EField m_eField; 2329 uint8_t m_uBit; 2330 }; 2331 2332 const ModeDescriptor g_modeDescriptors[14][82] = 2333 { 2334 { // Mode 1 (0x00) - 10 5 5 5 2335 { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2336 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2337 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2338 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2339 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2340 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2341 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2342 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2343 { D, 3 },{ D, 4 }, 2344 }, 2345 2346 { // Mode 2 (0x01) - 
7 6 6 6 2347 { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2348 { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2349 { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2350 { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2351 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2352 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2353 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2354 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 }, 2355 { D, 3 },{ D, 4 }, 2356 }, 2357 2358 { // Mode 3 (0x02) - 11 5 4 4 2359 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2360 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2361 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2362 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2363 { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 }, 2364 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 }, 2365 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2366 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2367 { D, 3 },{ D, 4 }, 2368 }, 2369 2370 { // Mode 4 (0x06) - 11 4 5 4 2371 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2372 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 
},{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2373 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2374 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 }, 2375 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2376 { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 }, 2377 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 }, 2378 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2379 { D, 3 },{ D, 4 }, 2380 }, 2381 2382 { // Mode 5 (0x0a) - 11 4 4 5 2383 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2384 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2385 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2386 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 }, 2387 { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 }, 2388 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2389 { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 }, 2390 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2391 { D, 3 },{ D, 4 }, 2392 }, 2393 2394 { // Mode 6 (0x0e) - 9 5 5 5 2395 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2396 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2397 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2398 { BW, 5 },{ BW, 6 },{ 
BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2399 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2400 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2401 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2402 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2403 { D, 3 },{ D, 4 }, 2404 }, 2405 2406 { // Mode 7 (0x12) - 8 6 5 5 2407 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2408 { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2409 { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2410 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2411 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2412 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2413 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2414 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 }, 2415 { D, 3 },{ D, 4 }, 2416 }, 2417 2418 { // Mode 8 (0x16) - 8 5 6 5 2419 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2420 { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2421 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2422 { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2423 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ 
GX, 4 }, 2424 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2425 { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2426 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2427 { D, 3 },{ D, 4 }, 2428 }, 2429 2430 { // Mode 9 (0x1a) - 8 5 5 6 2431 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2432 { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2433 { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2434 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2435 { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2436 { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2437 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2438 { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 }, 2439 { D, 3 },{ D, 4 }, 2440 }, 2441 2442 { // Mode 10 (0x1e) - 6 6 6 6 2443 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2444 { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2445 { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2446 { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2447 { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2448 { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2449 { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 
0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 }, 2450 { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 }, 2451 { D, 3 },{ D, 4 }, 2452 }, 2453 2454 { // Mode 11 (0x03) - 10 10 2455 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2456 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2457 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2458 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2459 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2460 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2461 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2462 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2463 { NA, 0 },{ NA, 0 }, 2464 }, 2465 2466 { // Mode 12 (0x07) - 11 9 2467 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2468 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2469 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2470 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2471 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2472 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2473 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2474 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2475 { NA, 0 },{ NA, 0 
}, 2476 }, 2477 2478 { // Mode 13 (0x0b) - 12 8 2479 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2480 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2481 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2482 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 }, 2483 { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 }, 2484 { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 }, 2485 { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2486 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2487 { NA, 0 },{ NA, 0 }, 2488 }, 2489 2490 { // Mode 14 (0x0f) - 16 4 2491 { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 }, 2492 { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 }, 2493 { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 }, 2494 { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 }, 2495 { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 }, 2496 { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 }, 2497 { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2498 { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 }, 2499 { NA, 0 },{ NA, 0 }, 2500 }, 2501 }; 2502 } 2503 2504 struct PackingVector 2505 { 2506 uint32_t m_vector[4]; 2507 int m_offset; 2508 Initcvtt::Internal::PackingVector2509 void Init() 2510 { 
2511 for (int i = 0; i < 4; i++) 2512 m_vector[i] = 0; 2513 2514 m_offset = 0; 2515 } 2516 Packcvtt::Internal::PackingVector2517 inline void Pack(ParallelMath::ScalarUInt16 value, int bits) 2518 { 2519 int vOffset = m_offset >> 5; 2520 int bitOffset = m_offset & 0x1f; 2521 2522 m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff); 2523 2524 int overflowBits = bitOffset + bits - 32; 2525 if (overflowBits > 0) 2526 m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits)); 2527 2528 m_offset += bits; 2529 } 2530 Flushcvtt::Internal::PackingVector2531 inline void Flush(uint8_t* output) 2532 { 2533 assert(m_offset == 128); 2534 2535 for (int v = 0; v < 4; v++) 2536 { 2537 uint32_t chunk = m_vector[v]; 2538 for (int b = 0; b < 4; b++) 2539 output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff); 2540 } 2541 } 2542 }; 2543 2544 2545 struct UnpackingVector 2546 { 2547 uint32_t m_vector[4]; 2548 Initcvtt::Internal::UnpackingVector2549 void Init(const uint8_t *bytes) 2550 { 2551 for (int i = 0; i < 4; i++) 2552 m_vector[i] = 0; 2553 2554 for (int b = 0; b < 16; b++) 2555 m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8)); 2556 } 2557 Unpackcvtt::Internal::UnpackingVector2558 inline ParallelMath::ScalarUInt16 Unpack(int bits) 2559 { 2560 uint32_t bitMask = (1 << bits) - 1; 2561 2562 ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask); 2563 2564 for (int i = 0; i < 4; i++) 2565 { 2566 m_vector[i] >>= bits; 2567 if (i != 3) 2568 m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits); 2569 } 2570 2571 return result; 2572 } 2573 }; 2574 ComputeTweakFactors(int tweak,int range,float * outFactors)2575 void ComputeTweakFactors(int tweak, int range, float *outFactors) 2576 { 2577 int totalUnits = range - 1; 2578 int minOutsideUnits = ((tweak >> 1) & 1); 2579 int maxOutsideUnits = (tweak & 1); 2580 int insideUnits = totalUnits - minOutsideUnits - 
maxOutsideUnits; 2581 2582 outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits); 2583 outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f; 2584 } 2585 ScaleHDRValue(const ParallelMath::Float & v,bool isSigned)2586 ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned) 2587 { 2588 if (isSigned) 2589 { 2590 ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f)); 2591 return (v * 32.0f + offset) / 31.0f; 2592 } 2593 else 2594 return (v * 64.0f + 30.0f) / 31.0f; 2595 } 2596 UnscaleHDRValueSigned(const ParallelMath::SInt16 & v)2597 ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v) 2598 { 2599 #ifdef CVTT_ENABLE_ASSERTS 2600 for (int i = 0; i < ParallelMath::ParallelSize; i++) 2601 assert(ParallelMath::Extract(v, i) != -32768) 2602 #endif 2603 2604 ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0)); 2605 ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v)); 2606 2607 ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31)); 2608 ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5); 2609 ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted); 2610 ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768)); 2611 2612 return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits; 2613 } 2614 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 & v)2615 ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v) 2616 { 2617 return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, 
ParallelMath::MakeUInt15(31)), 6)); 2618 } 2619 UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3],ParallelMath::AInt16 outEP[2][3],bool isSigned)2620 void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned) 2621 { 2622 for (int epi = 0; epi < 2; epi++) 2623 { 2624 for (int ch = 0; ch < 3; ch++) 2625 { 2626 if (isSigned) 2627 outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch]))); 2628 else 2629 outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch]))); 2630 } 2631 } 2632 } 2633 2634 template<int TVectorSize> 2635 class UnfinishedEndpoints 2636 { 2637 public: 2638 typedef ParallelMath::Float MFloat; 2639 typedef ParallelMath::UInt16 MUInt16; 2640 typedef ParallelMath::UInt15 MUInt15; 2641 typedef ParallelMath::SInt16 MSInt16; 2642 typedef ParallelMath::SInt32 MSInt32; 2643 UnfinishedEndpoints()2644 UnfinishedEndpoints() 2645 { 2646 } 2647 UnfinishedEndpoints(const MFloat * base,const MFloat * offset)2648 UnfinishedEndpoints(const MFloat *base, const MFloat *offset) 2649 { 2650 for (int ch = 0; ch < TVectorSize; ch++) 2651 m_base[ch] = base[ch]; 2652 for (int ch = 0; ch < TVectorSize; ch++) 2653 m_offset[ch] = offset[ch]; 2654 } 2655 UnfinishedEndpoints(const UnfinishedEndpoints & other)2656 UnfinishedEndpoints(const UnfinishedEndpoints& other) 2657 { 2658 for (int ch = 0; ch < TVectorSize; ch++) 2659 m_base[ch] = other.m_base[ch]; 2660 for (int ch = 0; ch < TVectorSize; ch++) 2661 m_offset[ch] = other.m_offset[ch]; 2662 } 2663 FinishHDRUnsigned(int tweak,int range,MSInt16 * outEP0,MSInt16 * outEP1,ParallelMath::RoundTowardNearestForScope * roundingMode)2664 void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope 
*roundingMode) 2665 { 2666 float tweakFactors[2]; 2667 ComputeTweakFactors(tweak, range, tweakFactors); 2668 2669 for (int ch = 0; ch < TVectorSize; ch++) 2670 { 2671 MUInt15 channelEPs[2]; 2672 for (int epi = 0; epi < 2; epi++) 2673 { 2674 MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f); 2675 channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode); 2676 } 2677 2678 outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]); 2679 outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]); 2680 } 2681 } 2682 FinishHDRSigned(int tweak,int range,MSInt16 * outEP0,MSInt16 * outEP1,ParallelMath::RoundTowardNearestForScope * roundingMode)2683 void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode) 2684 { 2685 float tweakFactors[2]; 2686 ComputeTweakFactors(tweak, range, tweakFactors); 2687 2688 for (int ch = 0; ch < TVectorSize; ch++) 2689 { 2690 MSInt16 channelEPs[2]; 2691 for (int epi = 0; epi < 2; epi++) 2692 { 2693 MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f); 2694 channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode); 2695 } 2696 2697 outEP0[ch] = channelEPs[0]; 2698 outEP1[ch] = channelEPs[1]; 2699 } 2700 } 2701 FinishLDR(int tweak,int range,MUInt15 * outEP0,MUInt15 * outEP1)2702 void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1) 2703 { 2704 ParallelMath::RoundTowardNearestForScope roundingMode; 2705 2706 float tweakFactors[2]; 2707 ComputeTweakFactors(tweak, range, tweakFactors); 2708 2709 for (int ch = 0; ch < TVectorSize; ch++) 2710 { 2711 MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f); 2712 MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f); 2713 outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode); 2714 outEP1[ch] 
= ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode); 2715 } 2716 } 2717 2718 template<int TNewVectorSize> ExpandTo(float filler)2719 UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler) 2720 { 2721 MFloat newBase[TNewVectorSize]; 2722 MFloat newOffset[TNewVectorSize]; 2723 2724 for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++) 2725 { 2726 newBase[ch] = m_base[ch]; 2727 newOffset[ch] = m_offset[ch]; 2728 } 2729 2730 MFloat fillerV = ParallelMath::MakeFloat(filler); 2731 2732 for (int ch = TVectorSize; ch < TNewVectorSize; ch++) 2733 { 2734 newBase[ch] = fillerV; 2735 newOffset[ch] = ParallelMath::MakeFloatZero(); 2736 } 2737 2738 return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset); 2739 } 2740 2741 private: 2742 MFloat m_base[TVectorSize]; 2743 MFloat m_offset[TVectorSize]; 2744 }; 2745 2746 template<int TMatrixSize> 2747 class PackedCovarianceMatrix 2748 { 2749 public: 2750 // 0: xx, 2751 // 1: xy, yy 2752 // 3: xz, yz, zz 2753 // 6: xw, yw, zw, ww 2754 // ... etc. 
2755 static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2; 2756 2757 typedef ParallelMath::Float MFloat; 2758 PackedCovarianceMatrix()2759 PackedCovarianceMatrix() 2760 { 2761 for (int i = 0; i < PyramidSize; i++) 2762 m_values[i] = ParallelMath::MakeFloatZero(); 2763 } 2764 Add(const ParallelMath::Float * vec,const ParallelMath::Float & weight)2765 void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight) 2766 { 2767 int index = 0; 2768 for (int row = 0; row < TMatrixSize; row++) 2769 { 2770 for (int col = 0; col <= row; col++) 2771 { 2772 m_values[index] = m_values[index] + vec[row] * vec[col] * weight; 2773 index++; 2774 } 2775 } 2776 } 2777 Product(MFloat * outVec,const MFloat * inVec)2778 void Product(MFloat *outVec, const MFloat *inVec) 2779 { 2780 for (int row = 0; row < TMatrixSize; row++) 2781 { 2782 MFloat sum = ParallelMath::MakeFloatZero(); 2783 2784 int index = (row * (row + 1)) >> 1; 2785 for (int col = 0; col < TMatrixSize; col++) 2786 { 2787 sum = sum + inVec[col] * m_values[index]; 2788 if (col >= row) 2789 index += col + 1; 2790 else 2791 index++; 2792 } 2793 2794 outVec[row] = sum; 2795 } 2796 } 2797 2798 private: 2799 ParallelMath::Float m_values[PyramidSize]; 2800 }; 2801 2802 static const int NumEndpointSelectorPasses = 3; 2803 2804 template<int TVectorSize, int TIterationCount> 2805 class EndpointSelector 2806 { 2807 public: 2808 typedef ParallelMath::Float MFloat; 2809 EndpointSelector()2810 EndpointSelector() 2811 { 2812 for (int ch = 0; ch < TVectorSize; ch++) 2813 { 2814 m_centroid[ch] = ParallelMath::MakeFloatZero(); 2815 m_direction[ch] = ParallelMath::MakeFloatZero(); 2816 } 2817 m_weightTotal = ParallelMath::MakeFloatZero(); 2818 m_minDist = ParallelMath::MakeFloat(FLT_MAX); 2819 m_maxDist = ParallelMath::MakeFloat(-FLT_MAX); 2820 } 2821 ContributePass(const MFloat * value,int pass,const MFloat & weight)2822 void ContributePass(const MFloat *value, int pass, const MFloat &weight) 2823 { 2824 if (pass 
== 0) 2825 ContributeCentroid(value, weight); 2826 else if (pass == 1) 2827 ContributeDirection(value, weight); 2828 else if (pass == 2) 2829 ContributeMinMax(value); 2830 } 2831 FinishPass(int pass)2832 void FinishPass(int pass) 2833 { 2834 if (pass == 0) 2835 FinishCentroid(); 2836 else if (pass == 1) 2837 FinishDirection(); 2838 } 2839 GetEndpoints(const float channelWeights[TVectorSize]) const2840 UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const 2841 { 2842 MFloat unweightedBase[TVectorSize]; 2843 MFloat unweightedOffset[TVectorSize]; 2844 2845 for (int ch = 0; ch < TVectorSize; ch++) 2846 { 2847 MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist; 2848 MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist; 2849 2850 float safeWeight = channelWeights[ch]; 2851 if (safeWeight == 0.f) 2852 safeWeight = 1.0f; 2853 2854 unweightedBase[ch] = min / channelWeights[ch]; 2855 unweightedOffset[ch] = (max - min) / channelWeights[ch]; 2856 } 2857 2858 return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset); 2859 } 2860 2861 private: ContributeCentroid(const MFloat * value,const MFloat & weight)2862 void ContributeCentroid(const MFloat *value, const MFloat &weight) 2863 { 2864 for (int ch = 0; ch < TVectorSize; ch++) 2865 m_centroid[ch] = m_centroid[ch] + value[ch] * weight; 2866 m_weightTotal = m_weightTotal + weight; 2867 } 2868 FinishCentroid()2869 void FinishCentroid() 2870 { 2871 MFloat denom = m_weightTotal; 2872 ParallelMath::MakeSafeDenominator(denom); 2873 2874 for (int ch = 0; ch < TVectorSize; ch++) 2875 m_centroid[ch] = m_centroid[ch] / denom; 2876 } 2877 ContributeDirection(const MFloat * value,const MFloat & weight)2878 void ContributeDirection(const MFloat *value, const MFloat &weight) 2879 { 2880 MFloat diff[TVectorSize]; 2881 for (int ch = 0; ch < TVectorSize; ch++) 2882 diff[ch] = value[ch] - m_centroid[ch]; 2883 2884 m_covarianceMatrix.Add(diff, weight); 2885 } 2886 
FinishDirection()2887 void FinishDirection() 2888 { 2889 MFloat approx[TVectorSize]; 2890 for (int ch = 0; ch < TVectorSize; ch++) 2891 approx[ch] = ParallelMath::MakeFloat(1.0f); 2892 2893 for (int i = 0; i < TIterationCount; i++) 2894 { 2895 MFloat product[TVectorSize]; 2896 m_covarianceMatrix.Product(product, approx); 2897 2898 MFloat largestComponent = product[0]; 2899 for (int ch = 1; ch < TVectorSize; ch++) 2900 largestComponent = ParallelMath::Max(largestComponent, product[ch]); 2901 2902 // product = largestComponent*newApprox 2903 ParallelMath::MakeSafeDenominator(largestComponent); 2904 for (int ch = 0; ch < TVectorSize; ch++) 2905 approx[ch] = product[ch] / largestComponent; 2906 } 2907 2908 // Normalize 2909 MFloat approxLen = ParallelMath::MakeFloatZero(); 2910 for (int ch = 0; ch < TVectorSize; ch++) 2911 approxLen = approxLen + approx[ch] * approx[ch]; 2912 2913 approxLen = ParallelMath::Sqrt(approxLen); 2914 2915 ParallelMath::MakeSafeDenominator(approxLen); 2916 2917 for (int ch = 0; ch < TVectorSize; ch++) 2918 m_direction[ch] = approx[ch] / approxLen; 2919 } 2920 ContributeMinMax(const MFloat * value)2921 void ContributeMinMax(const MFloat *value) 2922 { 2923 MFloat dist = ParallelMath::MakeFloatZero(); 2924 for (int ch = 0; ch < TVectorSize; ch++) 2925 dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]); 2926 2927 m_minDist = ParallelMath::Min(m_minDist, dist); 2928 m_maxDist = ParallelMath::Max(m_maxDist, dist); 2929 } 2930 2931 ParallelMath::Float m_centroid[TVectorSize]; 2932 ParallelMath::Float m_direction[TVectorSize]; 2933 PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix; 2934 ParallelMath::Float m_weightTotal; 2935 2936 ParallelMath::Float m_minDist; 2937 ParallelMath::Float m_maxDist; 2938 }; 2939 2940 static const ParallelMath::UInt16 g_weightReciprocals[] = 2941 { 2942 ParallelMath::MakeUInt16(0), // -1 2943 ParallelMath::MakeUInt16(0), // 0 2944 ParallelMath::MakeUInt16(32768), // 1 2945 ParallelMath::MakeUInt16(16384), 
// 2 2946 ParallelMath::MakeUInt16(10923), // 3 2947 ParallelMath::MakeUInt16(8192), // 4 2948 ParallelMath::MakeUInt16(6554), // 5 2949 ParallelMath::MakeUInt16(5461), // 6 2950 ParallelMath::MakeUInt16(4681), // 7 2951 ParallelMath::MakeUInt16(4096), // 8 2952 ParallelMath::MakeUInt16(3641), // 9 2953 ParallelMath::MakeUInt16(3277), // 10 2954 ParallelMath::MakeUInt16(2979), // 11 2955 ParallelMath::MakeUInt16(2731), // 12 2956 ParallelMath::MakeUInt16(2521), // 13 2957 ParallelMath::MakeUInt16(2341), // 14 2958 ParallelMath::MakeUInt16(2185), // 15 2959 }; 2960 2961 template<int TVectorSize> 2962 class IndexSelector 2963 { 2964 public: 2965 typedef ParallelMath::Float MFloat; 2966 typedef ParallelMath::UInt16 MUInt16; 2967 typedef ParallelMath::UInt15 MUInt15; 2968 typedef ParallelMath::SInt16 MSInt16; 2969 typedef ParallelMath::AInt16 MAInt16; 2970 typedef ParallelMath::SInt32 MSInt32; 2971 typedef ParallelMath::UInt31 MUInt31; 2972 2973 template<class TInterpolationEPType, class TColorEPType> Init(const float * channelWeights,const TInterpolationEPType interpolationEndPoints[2][TVectorSize],const TColorEPType colorSpaceEndpoints[2][TVectorSize],int range)2974 void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range) 2975 { 2976 // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space. 2977 // We need to select indexes using the color-space endpoints. 
2978 2979 m_isUniform = true; 2980 for (int ch = 1; ch < TVectorSize; ch++) 2981 { 2982 if (channelWeights[ch] != channelWeights[0]) 2983 m_isUniform = false; 2984 } 2985 2986 // To work with channel weights, we need something where: 2987 // pxDiff = px - ep[0] 2988 // epDiff = ep[1] - ep[0] 2989 // 2990 // weightedEPDiff = epDiff * channelWeights 2991 // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff) 2992 // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff) 2993 // index = normalizedIndex * maxValue 2994 // 2995 // Equivalent to: 2996 // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights) 2997 // index = dot(axis, pxDiff) 2998 2999 for (int ep = 0; ep < 2; ep++) 3000 for (int ch = 0; ch < TVectorSize; ch++) 3001 m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]); 3002 3003 m_range = range; 3004 m_maxValue = static_cast<float>(range - 1); 3005 3006 MFloat epDiffWeighted[TVectorSize]; 3007 for (int ch = 0; ch < TVectorSize; ch++) 3008 { 3009 m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]); 3010 MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]); 3011 epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch]; 3012 } 3013 3014 MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0]; 3015 for (int ch = 1; ch < TVectorSize; ch++) 3016 lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch]; 3017 3018 ParallelMath::MakeSafeDenominator(lenSquared); 3019 3020 MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared; 3021 3022 for (int ch = 0; ch < TVectorSize; ch++) 3023 m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared; 3024 } 3025 3026 template<bool TSigned> Init(const float channelWeights[TVectorSize],const MUInt15 endPoints[2][TVectorSize],int range)3027 void Init(const float 
channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range) 3028 { 3029 MAInt16 converted[2][TVectorSize]; 3030 for (int epi = 0; epi < 2; epi++) 3031 for (int ch = 0; ch < TVectorSize; ch++) 3032 converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]); 3033 3034 Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range); 3035 } 3036 ReconstructLDR_BC7(const MUInt15 & index,MUInt15 * pixel,int numRealChannels)3037 void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels) 3038 { 3039 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); 3040 3041 for (int ch = 0; ch < numRealChannels; ch++) 3042 { 3043 MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch]))); 3044 MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch]))); 3045 pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6)); 3046 } 3047 } 3048 ReconstructLDRPrecise(const MUInt15 & index,MUInt15 * pixel,int numRealChannels)3049 void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels) 3050 { 3051 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7)); 3052 3053 for (int ch = 0; ch < numRealChannels; ch++) 3054 { 3055 MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch]))); 3056 MUInt15 ep1f = 
ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch]))); 3057 pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8)); 3058 } 3059 } 3060 ReconstructLDR_BC7(const MUInt15 & index,MUInt15 * pixel)3061 void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel) 3062 { 3063 ReconstructLDR_BC7(index, pixel, TVectorSize); 3064 } 3065 ReconstructLDRPrecise(const MUInt15 & index,MUInt15 * pixel)3066 void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel) 3067 { 3068 ReconstructLDRPrecise(index, pixel, TVectorSize); 3069 } 3070 SelectIndexLDR(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope * rtn) const3071 MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const 3072 { 3073 MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0]; 3074 for (int ch = 1; ch < TVectorSize; ch++) 3075 dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch]; 3076 3077 return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn); 3078 } 3079 3080 protected: 3081 MAInt16 m_endPoint[2][TVectorSize]; 3082 3083 private: 3084 MFloat m_origin[TVectorSize]; 3085 MFloat m_axis[TVectorSize]; 3086 int m_range; 3087 float m_maxValue; 3088 bool m_isUniform; 3089 }; 3090 3091 3092 template<int TVectorSize> 3093 class IndexSelectorHDR : public IndexSelector<TVectorSize> 3094 { 3095 public: 3096 typedef ParallelMath::UInt15 MUInt15; 3097 typedef ParallelMath::UInt16 MUInt16; 3098 typedef ParallelMath::UInt31 MUInt31; 3099 typedef ParallelMath::SInt16 MSInt16; 3100 typedef ParallelMath::SInt32 MSInt32; 3101 typedef ParallelMath::Float MFloat; 3102 3103 private: 3104 InvertSingle(const MUInt15 & anIndex) const3105 MUInt15 InvertSingle(const MUInt15& anIndex) const 3106 { 3107 MUInt15 inverted = m_maxValueMinusOne - anIndex; 3108 return 
ParallelMath::Select(m_isInverted, inverted, anIndex); 3109 } 3110 ReconstructHDRSignedUninverted(const MUInt15 & index,MSInt16 * pixel) const3111 void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const 3112 { 3113 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); 3114 3115 for (int ch = 0; ch < TVectorSize; ch++) 3116 { 3117 MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]); 3118 MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]); 3119 3120 MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1); 3121 3122 pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6); 3123 3124 pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32)); 3125 } 3126 } 3127 ReconstructHDRUnsignedUninverted(const MUInt15 & index,MSInt16 * pixel) const3128 void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const 3129 { 3130 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); 3131 3132 for (int ch = 0; ch < TVectorSize; ch++) 3133 { 3134 MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]); 3135 MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]); 3136 3137 MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1); 3138 3139 pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6); 3140 3141 pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31))); 3142 } 3143 } 3144 ErrorForInterpolatorComponent(int index,int ch,const MFloat * pixel) const3145 MFloat 
ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const 3146 { 3147 MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch]; 3148 return diff * diff; 3149 } 3150 ErrorForInterpolator(int index,const MFloat * pixel) const3151 MFloat ErrorForInterpolator(int index, const MFloat *pixel) const 3152 { 3153 MFloat error = ErrorForInterpolatorComponent(index, 0, pixel); 3154 for (int ch = 1; ch < TVectorSize; ch++) 3155 error = error + ErrorForInterpolatorComponent(index, ch, pixel); 3156 return error; 3157 } 3158 3159 public: 3160 InitHDR(int range,bool isSigned,bool fastIndexing,const float * channelWeights)3161 void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights) 3162 { 3163 assert(range <= 16); 3164 3165 m_range = range; 3166 3167 m_isInverted = ParallelMath::MakeBoolInt16(false); 3168 m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1)); 3169 3170 if (!fastIndexing) 3171 { 3172 for (int i = 0; i < range; i++) 3173 { 3174 MSInt16 recon2CL[TVectorSize]; 3175 3176 if (isSigned) 3177 ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL); 3178 else 3179 ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL); 3180 3181 for (int ch = 0; ch < TVectorSize; ch++) 3182 m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch]; 3183 } 3184 } 3185 } 3186 ReconstructHDRSigned(const MUInt15 & index,MSInt16 * pixel) const3187 void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const 3188 { 3189 ReconstructHDRSignedUninverted(InvertSingle(index), pixel); 3190 } 3191 ReconstructHDRUnsigned(const MUInt15 & index,MSInt16 * pixel) const3192 void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const 3193 { 3194 ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel); 3195 } 3196 ConditionalInvert(const ParallelMath::Int16CompFlag & invert)3197 
void ConditionalInvert(const ParallelMath::Int16CompFlag &invert) 3198 { 3199 m_isInverted = invert; 3200 } 3201 SelectIndexHDRSlow(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope *) const3202 MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const 3203 { 3204 MUInt15 index = ParallelMath::MakeUInt15(0); 3205 3206 MFloat bestError = ErrorForInterpolator(0, pixel); 3207 for (int i = 1; i < m_range; i++) 3208 { 3209 MFloat error = ErrorForInterpolator(i, pixel); 3210 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); 3211 ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i))); 3212 bestError = ParallelMath::Min(bestError, error); 3213 } 3214 3215 return InvertSingle(index); 3216 } 3217 SelectIndexHDRFast(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope * rtn) const3218 MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const 3219 { 3220 return InvertSingle(this->SelectIndexLDR(pixel, rtn)); 3221 } 3222 3223 private: 3224 MFloat m_reconstructedInterpolators[16][TVectorSize]; 3225 ParallelMath::Int16CompFlag m_isInverted; 3226 MUInt15 m_maxValueMinusOne; 3227 int m_range; 3228 }; 3229 3230 // Solve for a, b where v = a*t + b 3231 // This allows endpoints to be mapped to where T=0 and T=1 3232 // Least squares from totals: 3233 // a = (tv - t*v/w)/(tt - t*t/w) 3234 // b = (v - a*t)/w 3235 template<int TVectorSize> 3236 class EndpointRefiner 3237 { 3238 public: 3239 typedef ParallelMath::Float MFloat; 3240 typedef ParallelMath::UInt16 MUInt16; 3241 typedef ParallelMath::UInt15 MUInt15; 3242 typedef ParallelMath::AInt16 MAInt16; 3243 typedef ParallelMath::SInt16 MSInt16; 3244 typedef ParallelMath::SInt32 MSInt32; 3245 3246 MFloat m_tv[TVectorSize]; 3247 MFloat m_v[TVectorSize]; 3248 MFloat m_tt; 3249 MFloat m_t; 3250 MFloat m_w; 3251 
int m_wu; 3252 3253 float m_rcpMaxIndex; 3254 float m_channelWeights[TVectorSize]; 3255 float m_rcpChannelWeights[TVectorSize]; 3256 Init(int indexRange,const float channelWeights[TVectorSize])3257 void Init(int indexRange, const float channelWeights[TVectorSize]) 3258 { 3259 for (int ch = 0; ch < TVectorSize; ch++) 3260 { 3261 m_tv[ch] = ParallelMath::MakeFloatZero(); 3262 m_v[ch] = ParallelMath::MakeFloatZero(); 3263 } 3264 m_tt = ParallelMath::MakeFloatZero(); 3265 m_t = ParallelMath::MakeFloatZero(); 3266 m_w = ParallelMath::MakeFloatZero(); 3267 3268 m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1); 3269 3270 for (int ch = 0; ch < TVectorSize; ch++) 3271 { 3272 m_channelWeights[ch] = channelWeights[ch]; 3273 m_rcpChannelWeights[ch] = 1.0f; 3274 if (m_channelWeights[ch] != 0.0f) 3275 m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch]; 3276 } 3277 3278 m_wu = 0; 3279 } 3280 ContributePW(const MFloat * pwFloatPixel,const MUInt15 & index,const MFloat & weight)3281 void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight) 3282 { 3283 MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex; 3284 3285 for (int ch = 0; ch < TVectorSize; ch++) 3286 { 3287 MFloat v = pwFloatPixel[ch] * weight; 3288 3289 m_tv[ch] = m_tv[ch] + t * v; 3290 m_v[ch] = m_v[ch] + v; 3291 } 3292 m_tt = m_tt + weight * t * t; 3293 m_t = m_t + weight * t; 3294 m_w = m_w + weight; 3295 } 3296 ContributeUnweightedPW(const MFloat * pwFloatPixel,const MUInt15 & index,int numRealChannels)3297 void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels) 3298 { 3299 MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex; 3300 3301 for (int ch = 0; ch < numRealChannels; ch++) 3302 { 3303 MFloat v = pwFloatPixel[ch]; 3304 3305 m_tv[ch] = m_tv[ch] + t * v; 3306 m_v[ch] = m_v[ch] + v; 3307 } 3308 m_tt = m_tt + t * t; 3309 m_t = m_t + t; 3310 m_wu++; 3311 } 3312 ContributeUnweightedPW(const MFloat * floatPixel,const MUInt15 & 
index)3313 void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index) 3314 { 3315 ContributeUnweightedPW(floatPixel, index, TVectorSize); 3316 } 3317 GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])3318 void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize]) 3319 { 3320 // a = (tv - t*v/w)/(tt - t*t/w) 3321 // b = (v - a*t)/w 3322 MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu)); 3323 3324 ParallelMath::MakeSafeDenominator(w); 3325 MFloat wRcp = ParallelMath::Reciprocal(w); 3326 3327 MFloat adenom = (m_tt * w - m_t * m_t) * wRcp; 3328 3329 ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero()); 3330 ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f)); 3331 3332 for (int ch = 0; ch < TVectorSize; ch++) 3333 { 3334 /* 3335 if (adenom == 0.0) 3336 p1 = p2 = er.v / er.w; 3337 else 3338 { 3339 float4 a = (er.tv - er.t*er.v / er.w) / adenom; 3340 float4 b = (er.v - a * er.t) / er.w; 3341 p1 = b; 3342 p2 = a + b; 3343 } 3344 */ 3345 3346 MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom; 3347 MFloat b = (m_v[ch] - a * m_t) * wRcp; 3348 3349 MFloat p1 = b; 3350 MFloat p2 = a + b; 3351 3352 ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp)); 3353 ParallelMath::ConditionalSet(p2, adenomZero, p1); 3354 3355 // Unweight 3356 float inverseWeight = m_rcpChannelWeights[ch]; 3357 3358 endPoint[0][ch] = p1 * inverseWeight; 3359 endPoint[1][ch] = p2 * inverseWeight; 3360 } 3361 } 3362 GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize],int numRealChannels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3363 void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode) 3364 { 3365 MFloat floatEndPoint[2][TVectorSize]; 3366 GetRefinedEndpoints(floatEndPoint); 3367 3368 for (int epi = 0; epi < 2; epi++) 3369 for (int ch = 0; ch < TVectorSize; ch++) 3370 
endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode); 3371 } 3372 GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize],const ParallelMath::RoundTowardNearestForScope * roundingMode)3373 void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3374 { 3375 GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode); 3376 } 3377 GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize],bool isSigned,const ParallelMath::RoundTowardNearestForScope * roundingMode)3378 void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode) 3379 { 3380 MFloat floatEndPoint[2][TVectorSize]; 3381 GetRefinedEndpoints(floatEndPoint); 3382 3383 for (int epi = 0; epi < 2; epi++) 3384 { 3385 for (int ch = 0; ch < TVectorSize; ch++) 3386 { 3387 MFloat f = floatEndPoint[epi][ch]; 3388 if (isSigned) 3389 endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode)); 3390 else 3391 endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode)); 3392 } 3393 } 3394 } 3395 }; 3396 3397 template<int TVectorSize> 3398 class AggregatedError 3399 { 3400 public: 3401 typedef ParallelMath::UInt16 MUInt16; 3402 typedef ParallelMath::UInt31 MUInt31; 3403 typedef ParallelMath::Float MFloat; 3404 AggregatedError()3405 AggregatedError() 3406 { 3407 for (int ch = 0; ch < TVectorSize; ch++) 3408 m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0); 3409 } 3410 Add(const MUInt16 & channelErrorUnweighted,int ch)3411 void Add(const MUInt16 &channelErrorUnweighted, int ch) 3412 { 3413 m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted); 3414 } 3415 Finalize(uint32_t 
flags,const float channelWeightsSq[TVectorSize]) const3416 MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const 3417 { 3418 if (flags & cvtt::Flags::Uniform) 3419 { 3420 MUInt31 total = m_errorUnweighted[0]; 3421 for (int ch = 1; ch < TVectorSize; ch++) 3422 total = total + m_errorUnweighted[ch]; 3423 return ParallelMath::ToFloat(total); 3424 } 3425 else 3426 { 3427 MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0]; 3428 for (int ch = 1; ch < TVectorSize; ch++) 3429 total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch]; 3430 return total; 3431 } 3432 } 3433 3434 private: 3435 MUInt31 m_errorUnweighted[TVectorSize]; 3436 }; 3437 3438 class BCCommon 3439 { 3440 public: 3441 typedef ParallelMath::Float MFloat; 3442 typedef ParallelMath::UInt16 MUInt16; 3443 typedef ParallelMath::UInt15 MUInt15; 3444 typedef ParallelMath::AInt16 MAInt16; 3445 typedef ParallelMath::SInt16 MSInt16; 3446 typedef ParallelMath::SInt32 MSInt32; 3447 TweakRoundsForRange(int range)3448 static int TweakRoundsForRange(int range) 3449 { 3450 if (range == 3) 3451 return 3; 3452 return 4; 3453 } 3454 3455 template<int TVectorSize> ComputeErrorLDR(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],int numRealChannels,AggregatedError<TVectorSize> & aggError)3456 static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError) 3457 { 3458 for (int ch = 0; ch < numRealChannels; ch++) 3459 aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch); 3460 } 3461 3462 template<int TVectorSize> ComputeErrorLDR(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],AggregatedError<TVectorSize> & aggError)3463 static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 
original[TVectorSize], AggregatedError<TVectorSize> &aggError) 3464 { 3465 ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError); 3466 } 3467 3468 template<int TVectorSize> ComputeErrorLDRSimple(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],int numRealChannels,const float * channelWeightsSq)3469 static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq) 3470 { 3471 AggregatedError<TVectorSize> aggError; 3472 ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError); 3473 return aggError.Finalize(flags, channelWeightsSq); 3474 } 3475 3476 template<int TVectorSize> ComputeErrorHDRFast(uint32_t flags,const MSInt16 reconstructed[TVectorSize],const MSInt16 original[TVectorSize],const float channelWeightsSq[TVectorSize])3477 static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize]) 3478 { 3479 MFloat error = ParallelMath::MakeFloatZero(); 3480 if (flags & Flags::Uniform) 3481 { 3482 for (int ch = 0; ch < TVectorSize; ch++) 3483 error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]); 3484 } 3485 else 3486 { 3487 for (int ch = 0; ch < TVectorSize; ch++) 3488 error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]); 3489 } 3490 3491 return error; 3492 } 3493 3494 template<int TVectorSize> ComputeErrorHDRSlow(uint32_t flags,const MSInt16 reconstructed[TVectorSize],const MSInt16 original[TVectorSize],const float channelWeightsSq[TVectorSize])3495 static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize]) 3496 { 3497 MFloat error = 
ParallelMath::MakeFloatZero(); 3498 if (flags & Flags::Uniform) 3499 { 3500 for (int ch = 0; ch < TVectorSize; ch++) 3501 error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]); 3502 } 3503 else 3504 { 3505 for (int ch = 0; ch < TVectorSize; ch++) 3506 error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]); 3507 } 3508 3509 return error; 3510 } 3511 3512 template<int TChannelCount> PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount],const MUInt15 pixels[16][TChannelCount],const float channelWeights[TChannelCount])3513 static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount]) 3514 { 3515 for (int px = 0; px < 16; px++) 3516 { 3517 for (int ch = 0; ch < TChannelCount; ch++) 3518 preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch]; 3519 } 3520 } 3521 3522 template<int TChannelCount> PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount],const MSInt16 pixels[16][TChannelCount],const float channelWeights[TChannelCount])3523 static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount]) 3524 { 3525 for (int px = 0; px < 16; px++) 3526 { 3527 for (int ch = 0; ch < TChannelCount; ch++) 3528 preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch]; 3529 } 3530 } 3531 }; 3532 3533 class BC7Computer 3534 { 3535 public: 3536 static const int MaxTweakRounds = 4; 3537 3538 typedef ParallelMath::SInt16 MSInt16; 3539 typedef ParallelMath::UInt15 MUInt15; 3540 typedef ParallelMath::UInt16 MUInt16; 3541 typedef ParallelMath::SInt32 MSInt32; 3542 typedef ParallelMath::Float MFloat; 3543 3544 struct WorkInfo 3545 { 3546 MUInt15 m_mode; 3547 MFloat m_error; 3548 MUInt15 m_ep[3][2][4]; 3549 MUInt15 m_indexes[16]; 3550 MUInt15 
m_indexes2[16]; 3551 3552 union 3553 { 3554 MUInt15 m_partition; 3555 struct IndexSelectorAndRotation 3556 { 3557 MUInt15 m_indexSelector; 3558 MUInt15 m_rotation; 3559 } m_isr; 3560 } m_u; 3561 }; 3562 TweakAlpha(const MUInt15 original[2],int tweak,int range,MUInt15 result[2])3563 static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2]) 3564 { 3565 ParallelMath::RoundTowardNearestForScope roundingMode; 3566 3567 float tf[2]; 3568 ComputeTweakFactors(tweak, range, tf); 3569 3570 MFloat base = ParallelMath::ToFloat(original[0]); 3571 MFloat offs = ParallelMath::ToFloat(original[1]) - base; 3572 3573 result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode); 3574 result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode); 3575 } 3576 Quantize(MUInt15 * color,int bits,int channels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3577 static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode) 3578 { 3579 float maxColor = static_cast<float>((1 << bits) - 1); 3580 3581 for (int i = 0; i < channels; i++) 3582 color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode); 3583 } 3584 QuantizeP(MUInt15 * color,int bits,uint16_t p,int channels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3585 static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode) 3586 { 3587 uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits)); 3588 MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift); 3589 3590 float maxColorF = static_cast<float>(255 - (1 << (7 - bits))); 3591 3592 float maxQuantized = static_cast<float>((1 << bits) - 1); 3593 3594 for (int ch = 0; ch 
< channels; ch++) 3595 { 3596 MUInt15 clr = color[ch]; 3597 if (p) 3598 clr = ParallelMath::Max(clr, pShiftV) - pShiftV; 3599 3600 MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF; 3601 3602 clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1; 3603 if (p) 3604 clr = clr | ParallelMath::MakeUInt15(1); 3605 3606 color[ch] = clr; 3607 } 3608 } 3609 Unquantize(MUInt15 * color,int bits,int channels)3610 static void Unquantize(MUInt15* color, int bits, int channels) 3611 { 3612 for (int ch = 0; ch < channels; ch++) 3613 { 3614 MUInt15 clr = color[ch]; 3615 clr = clr << (8 - bits); 3616 color[ch] = clr | ParallelMath::RightShift(clr, bits); 3617 } 3618 } 3619 CompressEndpoints0(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3620 static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3621 { 3622 for (int j = 0; j < 2; j++) 3623 { 3624 QuantizeP(ep[j], 4, p[j], 3, roundingMode); 3625 Unquantize(ep[j], 5, 3); 3626 ep[j][3] = ParallelMath::MakeUInt15(255); 3627 } 3628 } 3629 CompressEndpoints1(MUInt15 ep[2][4],uint16_t p,const ParallelMath::RoundTowardNearestForScope * roundingMode)3630 static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode) 3631 { 3632 for (int j = 0; j < 2; j++) 3633 { 3634 QuantizeP(ep[j], 6, p, 3, roundingMode); 3635 Unquantize(ep[j], 7, 3); 3636 ep[j][3] = ParallelMath::MakeUInt15(255); 3637 } 3638 } 3639 CompressEndpoints2(MUInt15 ep[2][4],const ParallelMath::RoundTowardNearestForScope * roundingMode)3640 static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3641 { 3642 for (int j = 0; j < 2; j++) 3643 { 3644 Quantize(ep[j], 5, 3, roundingMode); 3645 Unquantize(ep[j], 5, 3); 3646 ep[j][3] = ParallelMath::MakeUInt15(255); 3647 
} 3648 } 3649 CompressEndpoints3(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3650 static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3651 { 3652 for (int j = 0; j < 2; j++) 3653 { 3654 QuantizeP(ep[j], 7, p[j], 3, roundingMode); 3655 ep[j][3] = ParallelMath::MakeUInt15(255); 3656 } 3657 } 3658 CompressEndpoints4(MUInt15 epRGB[2][3],MUInt15 epA[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3659 static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3660 { 3661 for (int j = 0; j < 2; j++) 3662 { 3663 Quantize(epRGB[j], 5, 3, roundingMode); 3664 Unquantize(epRGB[j], 5, 3); 3665 3666 Quantize(epA + j, 6, 1, roundingMode); 3667 Unquantize(epA + j, 6, 1); 3668 } 3669 } 3670 CompressEndpoints5(MUInt15 epRGB[2][3],MUInt15 epA[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3671 static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3672 { 3673 for (int j = 0; j < 2; j++) 3674 { 3675 Quantize(epRGB[j], 7, 3, roundingMode); 3676 Unquantize(epRGB[j], 7, 3); 3677 } 3678 3679 // Alpha is full precision 3680 (void)epA; 3681 } 3682 CompressEndpoints6(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3683 static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3684 { 3685 for (int j = 0; j < 2; j++) 3686 QuantizeP(ep[j], 7, p[j], 4, roundingMode); 3687 } 3688 CompressEndpoints7(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3689 static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode) 3690 { 3691 for (int j = 0; j < 2; j++) 3692 { 3693 QuantizeP(ep[j], 
5, p[j], 4, roundingMode); 3694 Unquantize(ep[j], 6, 4); 3695 } 3696 } 3697 3698 struct SinglePlaneTemporaries 3699 { 3700 UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll]; 3701 UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12]; 3702 3703 MUInt15 fragmentBestIndexes[BC7Data::g_numFragments]; 3704 MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4]; 3705 MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode]; 3706 }; 3707 TrySingleColorRGBAMultiTable(uint32_t flags,const MUInt15 pixels[16][4],const MFloat average[4],int numRealChannels,const uint8_t * fragmentStart,int shapeLength,const MFloat & staticAlphaError,const ParallelMath::Int16CompFlag punchThroughInvalid[4],MFloat & shapeBestError,MUInt15 shapeBestEP[2][4],MUInt15 * fragmentBestIndexes,const float * channelWeightsSq,const cvtt::Tables::BC7SC::Table * const * tables,int numTables,const ParallelMath::RoundTowardNearestForScope * rtn)3708 static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn) 3709 { 3710 MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX); 3711 3712 MUInt15 intAverage[4]; 3713 for (int ch = 0; ch < 4; ch++) 3714 intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn); 3715 3716 MUInt15 eps[2][4]; 3717 MUInt15 reconstructed[4]; 3718 MUInt15 index = ParallelMath::MakeUInt15(0); 3719 3720 for (int epi = 0; epi < 2; epi++) 3721 { 3722 for (int ch = 0; ch < 3; ch++) 3723 eps[epi][ch] = ParallelMath::MakeUInt15(0); 3724 eps[epi][3] = ParallelMath::MakeUInt15(255); 3725 } 3726 3727 for (int ch = 0; ch < 3; ch++) 
3728 reconstructed[ch] = ParallelMath::MakeUInt15(0); 3729 reconstructed[3] = ParallelMath::MakeUInt15(255); 3730 3731 // Depending on the target index and parity bits, there are multiple valid solid colors. 3732 // We want to find the one closest to the actual average. 3733 MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX); 3734 for (int t = 0; t < numTables; t++) 3735 { 3736 const cvtt::Tables::BC7SC::Table& table = *(tables[t]); 3737 3738 ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits]; 3739 3740 MUInt15 candidateReconstructed[4]; 3741 MUInt15 candidateEPs[2][4]; 3742 3743 for (int i = 0; i < ParallelMath::ParallelSize; i++) 3744 { 3745 for (int ch = 0; ch < numRealChannels; ch++) 3746 { 3747 ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i); 3748 assert(avgValue >= 0 && avgValue <= 255); 3749 3750 const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue]; 3751 3752 ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min); 3753 ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max); 3754 ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor); 3755 } 3756 } 3757 3758 MFloat avgError = ParallelMath::MakeFloatZero(); 3759 for (int ch = 0; ch < numRealChannels; ch++) 3760 { 3761 MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch]; 3762 avgError = avgError + delta * delta * channelWeightsSq[ch]; 3763 } 3764 3765 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError)); 3766 better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations 3767 3768 if (ParallelMath::AnySet(better)) 3769 { 3770 ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError); 3771 3772 MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index); 3773 3774 ParallelMath::ConditionalSet(index, better, candidateIndex); 3775 3776 for (int ch = 0; ch 
< numRealChannels; ch++) 3777 ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]); 3778 3779 for (int epi = 0; epi < 2; epi++) 3780 for (int ch = 0; ch < numRealChannels; ch++) 3781 ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]); 3782 } 3783 } 3784 3785 AggregatedError<4> aggError; 3786 for (int pxi = 0; pxi < shapeLength; pxi++) 3787 { 3788 int px = fragmentStart[pxi]; 3789 3790 BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError); 3791 } 3792 3793 MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError; 3794 3795 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError)); 3796 if (ParallelMath::AnySet(better)) 3797 { 3798 shapeBestError = ParallelMath::Min(shapeBestError, error); 3799 for (int epi = 0; epi < 2; epi++) 3800 { 3801 for (int ch = 0; ch < numRealChannels; ch++) 3802 ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]); 3803 } 3804 3805 for (int pxi = 0; pxi < shapeLength; pxi++) 3806 ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index); 3807 } 3808 } 3809 3810 TrySinglePlane(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const float channelWeights[4],int numTweakRounds,int numRefineRounds,WorkInfo & work,const ParallelMath::RoundTowardNearestForScope * rtn)3811 static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn) 3812 { 3813 if (numRefineRounds < 1) 3814 numRefineRounds = 1; 3815 3816 if (numTweakRounds < 1) 3817 numTweakRounds = 1; 3818 else if (numTweakRounds > MaxTweakRounds) 3819 numTweakRounds = MaxTweakRounds; 3820 3821 float channelWeightsSq[4]; 3822 3823 for (int ch = 0; ch < 4; ch++) 3824 channelWeightsSq[ch] = 
channelWeights[ch] * channelWeights[ch]; 3825 3826 SinglePlaneTemporaries temps; 3827 3828 MUInt15 maxAlpha = ParallelMath::MakeUInt15(0); 3829 MUInt15 minAlpha = ParallelMath::MakeUInt15(255); 3830 ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true); 3831 for (int px = 0; px < 16; px++) 3832 { 3833 MUInt15 a = pixels[px][3]; 3834 maxAlpha = ParallelMath::Max(maxAlpha, a); 3835 minAlpha = ParallelMath::Min(minAlpha, a); 3836 3837 isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255)))); 3838 } 3839 3840 ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255)); 3841 ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha); 3842 3843 bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha); 3844 3845 // Try RGB modes if any block has a min alpha 251 or higher 3846 bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha)); 3847 3848 // Try mode 7 if any block has alpha. 3849 // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints 3850 // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific 3851 // situations, and only by at most 1 unit of error per pixel. 
3852 bool allowMode7 = anyBlockHasAlpha; 3853 3854 MFloat preWeightedPixels[16][4]; 3855 3856 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights); 3857 3858 const int *rgbInitialEPCollapseList = NULL; 3859 3860 // Get initial RGB endpoints 3861 if (allowRGBModes) 3862 { 3863 const int *shapeList; 3864 int numShapesToEvaluate; 3865 3866 if (flags & Flags::BC7_EnablePartitioning) 3867 { 3868 if (flags & Flags::BC7_Enable3Subsets) 3869 { 3870 shapeList = BC7Data::g_shapeListAll; 3871 rgbInitialEPCollapseList = BC7Data::g_shapeListAll; 3872 numShapesToEvaluate = BC7Data::g_numShapesAll; 3873 } 3874 else 3875 { 3876 shapeList = BC7Data::g_shapeList12; 3877 rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse; 3878 numShapesToEvaluate = BC7Data::g_numShapes12; 3879 } 3880 } 3881 else 3882 { 3883 shapeList = BC7Data::g_shapeList1; 3884 rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse; 3885 numShapesToEvaluate = BC7Data::g_numShapes1; 3886 } 3887 3888 for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++) 3889 { 3890 int shape = shapeList[shapeIter]; 3891 3892 int shapeStart = BC7Data::g_shapeRanges[shape][0]; 3893 int shapeSize = BC7Data::g_shapeRanges[shape][1]; 3894 3895 EndpointSelector<3, 8> epSelector; 3896 3897 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++) 3898 { 3899 for (int spx = 0; spx < shapeSize; spx++) 3900 { 3901 int px = BC7Data::g_fragments[shapeStart + spx]; 3902 epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f)); 3903 } 3904 epSelector.FinishPass(epPass); 3905 } 3906 temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights); 3907 } 3908 } 3909 3910 const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse; 3911 3912 // Get initial RGBA endpoints 3913 { 3914 const int *shapeList = BC7Data::g_shapeList12; 3915 int numShapesToEvaluate = BC7Data::g_numShapes12; 3916 3917 for (int shapeIter = 0; shapeIter < 
numShapesToEvaluate; shapeIter++) 3918 { 3919 int shape = shapeList[shapeIter]; 3920 3921 if (anyBlockHasAlpha || !allowRGBModes) 3922 { 3923 int shapeStart = BC7Data::g_shapeRanges[shape][0]; 3924 int shapeSize = BC7Data::g_shapeRanges[shape][1]; 3925 3926 EndpointSelector<4, 8> epSelector; 3927 3928 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++) 3929 { 3930 for (int spx = 0; spx < shapeSize; spx++) 3931 { 3932 int px = BC7Data::g_fragments[shapeStart + spx]; 3933 epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f)); 3934 } 3935 epSelector.FinishPass(epPass); 3936 } 3937 temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights); 3938 } 3939 else 3940 { 3941 temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255); 3942 } 3943 } 3944 } 3945 3946 for (uint16_t mode = 0; mode <= 7; mode++) 3947 { 3948 if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1) 3949 continue; 3950 3951 if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3) 3952 continue; 3953 3954 if (mode == 4 || mode == 5) 3955 continue; 3956 3957 if (mode < 4 && !allowRGBModes) 3958 continue; 3959 3960 if (mode == 7 && !allowMode7) 3961 continue; 3962 3963 bool isRGB = (mode < 4); 3964 3965 unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits; 3966 int numSubsets = BC7Data::g_modes[mode].m_numSubsets; 3967 int indexPrec = BC7Data::g_modes[mode].m_indexBits; 3968 3969 int parityBitMax = 1; 3970 if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint) 3971 parityBitMax = 4; 3972 else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset) 3973 parityBitMax = 2; 3974 3975 int numRealChannels = isRGB ? 
3 : 4; 3976 3977 int numShapes; 3978 const int *shapeList; 3979 const int *shapeCollapseList; 3980 3981 if (numSubsets == 1) 3982 { 3983 numShapes = BC7Data::g_numShapes1; 3984 shapeList = BC7Data::g_shapeList1; 3985 shapeCollapseList = BC7Data::g_shapeList1Collapse; 3986 } 3987 else if (numSubsets == 2) 3988 { 3989 numShapes = BC7Data::g_numShapes2; 3990 shapeList = BC7Data::g_shapeList2; 3991 shapeCollapseList = BC7Data::g_shapeList2Collapse; 3992 } 3993 else 3994 { 3995 assert(numSubsets == 3); 3996 if (numPartitions == 16) 3997 { 3998 numShapes = BC7Data::g_numShapes3Short; 3999 shapeList = BC7Data::g_shapeList3Short; 4000 shapeCollapseList = BC7Data::g_shapeList3ShortCollapse; 4001 } 4002 else 4003 { 4004 assert(numPartitions == 64); 4005 numShapes = BC7Data::g_numShapes3; 4006 shapeList = BC7Data::g_shapeList3; 4007 shapeCollapseList = BC7Data::g_shapeList3Collapse; 4008 } 4009 } 4010 4011 for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++) 4012 temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX); 4013 4014 for (int shapeIter = 0; shapeIter < numShapes; shapeIter++) 4015 { 4016 int shape = shapeList[shapeIter]; 4017 int shapeStart = BC7Data::g_shapeRanges[shape][0]; 4018 int shapeLength = BC7Data::g_shapeRanges[shape][1]; 4019 int shapeCollapsedEvalIndex = shapeCollapseList[shape]; 4020 4021 AggregatedError<1> alphaAggError; 4022 if (isRGB && anyBlockHasAlpha) 4023 { 4024 MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) }; 4025 4026 for (int pxi = 0; pxi < shapeLength; pxi++) 4027 { 4028 int px = BC7Data::g_fragments[shapeStart + pxi]; 4029 MUInt15 original[1] = { pixels[px][3] }; 4030 BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError); 4031 } 4032 } 4033 4034 float alphaWeightsSq[1] = { channelWeightsSq[3] }; 4035 MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq); 4036 4037 assert(shapeCollapsedEvalIndex >= 0); 4038 4039 MUInt15 tweakBaseEP[MaxTweakRounds][2][4]; 4040 4041 for (int 
tweak = 0; tweak < numTweakRounds; tweak++) 4042 { 4043 if (isRGB) 4044 { 4045 temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]); 4046 tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255); 4047 } 4048 else 4049 { 4050 temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]); 4051 } 4052 } 4053 4054 ParallelMath::Int16CompFlag punchThroughInvalid[4]; 4055 for (int pIter = 0; pIter < parityBitMax; pIter++) 4056 { 4057 punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false); 4058 4059 if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7)) 4060 { 4061 // Modes 6 and 7 have parity bits that affect alpha 4062 if (pIter == 0) 4063 punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha); 4064 else if (pIter == parityBitMax - 1) 4065 punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha); 4066 else 4067 punchThroughInvalid[pIter] = isPunchThrough; 4068 } 4069 } 4070 4071 for (int pIter = 0; pIter < parityBitMax; pIter++) 4072 { 4073 if (ParallelMath::AllSet(punchThroughInvalid[pIter])) 4074 continue; 4075 4076 bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]); 4077 4078 for (int tweak = 0; tweak < numTweakRounds; tweak++) 4079 { 4080 uint16_t p[2]; 4081 p[0] = (pIter & 1); 4082 p[1] = ((pIter >> 1) & 1); 4083 4084 MUInt15 ep[2][4]; 4085 4086 for (int epi = 0; epi < 2; epi++) 4087 for (int ch = 0; ch < 4; ch++) 4088 ep[epi][ch] = tweakBaseEP[tweak][epi][ch]; 4089 4090 for (int refine = 0; refine < numRefineRounds; refine++) 4091 { 4092 switch (mode) 4093 { 4094 case 0: 4095 CompressEndpoints0(ep, p, rtn); 4096 break; 4097 case 1: 4098 CompressEndpoints1(ep, p[0], rtn); 4099 break; 4100 case 2: 4101 CompressEndpoints2(ep, rtn); 4102 break; 4103 case 3: 4104 CompressEndpoints3(ep, p, rtn); 4105 break; 4106 case 
6: 4107 CompressEndpoints6(ep, p, rtn); 4108 break; 4109 case 7: 4110 CompressEndpoints7(ep, p, rtn); 4111 break; 4112 default: 4113 assert(false); 4114 break; 4115 }; 4116 4117 MFloat shapeError = ParallelMath::MakeFloatZero(); 4118 4119 IndexSelector<4> indexSelector; 4120 indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec); 4121 4122 EndpointRefiner<4> epRefiner; 4123 epRefiner.Init(1 << indexPrec, channelWeights); 4124 4125 MUInt15 indexes[16]; 4126 4127 AggregatedError<4> aggError; 4128 for (int pxi = 0; pxi < shapeLength; pxi++) 4129 { 4130 int px = BC7Data::g_fragments[shapeStart + pxi]; 4131 4132 MUInt15 index; 4133 MUInt15 reconstructed[4]; 4134 4135 index = indexSelector.SelectIndexLDR(floatPixels[px], rtn); 4136 indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels); 4137 4138 if (flags & cvtt::Flags::BC7_FastIndexing) 4139 BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError); 4140 else 4141 { 4142 MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq); 4143 4144 MUInt15 altIndexes[2]; 4145 altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1); 4146 altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1))); 4147 4148 for (int ii = 0; ii < 2; ii++) 4149 { 4150 indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels); 4151 4152 MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq); 4153 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error)); 4154 error = ParallelMath::Min(error, altError); 4155 ParallelMath::ConditionalSet(index, better, altIndexes[ii]); 4156 } 4157 4158 shapeError = shapeError + error; 4159 } 4160 4161 if (refine != numRefineRounds - 1) 4162 
epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels); 4163 4164 indexes[pxi] = index; 4165 } 4166 4167 if (flags & cvtt::Flags::BC7_FastIndexing) 4168 shapeError = aggError.Finalize(flags, channelWeightsSq); 4169 4170 if (isRGB) 4171 shapeError = shapeError + staticAlphaError; 4172 4173 ParallelMath::FloatCompFlag shapeErrorBetter; 4174 ParallelMath::Int16CompFlag shapeErrorBetter16; 4175 4176 shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]); 4177 shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter); 4178 4179 if (ParallelMath::AnySet(shapeErrorBetter16)) 4180 { 4181 bool punchThroughOK = true; 4182 if (needPunchThroughCheck) 4183 { 4184 shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16); 4185 shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16); 4186 4187 if (!ParallelMath::AnySet(shapeErrorBetter16)) 4188 punchThroughOK = false; 4189 } 4190 4191 if (punchThroughOK) 4192 { 4193 ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError); 4194 for (int epi = 0; epi < 2; epi++) 4195 for (int ch = 0; ch < numRealChannels; ch++) 4196 ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]); 4197 4198 for (int pxi = 0; pxi < shapeLength; pxi++) 4199 ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]); 4200 } 4201 } 4202 4203 if (refine != numRefineRounds - 1) 4204 epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn); 4205 } // refine 4206 } // tweak 4207 } // p 4208 4209 if (flags & cvtt::Flags::BC7_TrySingleColor) 4210 { 4211 MUInt15 total[4]; 4212 for (int ch = 0; ch < 4; ch++) 4213 total[ch] = ParallelMath::MakeUInt15(0); 4214 4215 for (int pxi = 0; pxi < shapeLength; pxi++) 4216 { 4217 int px = BC7Data::g_fragments[shapeStart + pxi]; 4218 for (int ch = 0; 
ch < 4; ch++) 4219 total[ch] = total[ch] + pixels[pxi][ch]; 4220 } 4221 4222 MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength)); 4223 MFloat average[4]; 4224 for (int ch = 0; ch < 4; ch++) 4225 average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength; 4226 4227 const uint8_t *fragment = BC7Data::g_fragments + shapeStart; 4228 MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex]; 4229 MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex]; 4230 MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart; 4231 4232 const cvtt::Tables::BC7SC::Table **scTables = NULL; 4233 int numSCTables = 0; 4234 4235 switch (mode) 4236 { 4237 case 0: 4238 { 4239 const cvtt::Tables::BC7SC::Table *tables[] = 4240 { 4241 &cvtt::Tables::BC7SC::g_mode0_p00_i1, 4242 &cvtt::Tables::BC7SC::g_mode0_p00_i2, 4243 &cvtt::Tables::BC7SC::g_mode0_p00_i3, 4244 &cvtt::Tables::BC7SC::g_mode0_p01_i1, 4245 &cvtt::Tables::BC7SC::g_mode0_p01_i2, 4246 &cvtt::Tables::BC7SC::g_mode0_p01_i3, 4247 &cvtt::Tables::BC7SC::g_mode0_p10_i1, 4248 &cvtt::Tables::BC7SC::g_mode0_p10_i2, 4249 &cvtt::Tables::BC7SC::g_mode0_p10_i3, 4250 &cvtt::Tables::BC7SC::g_mode0_p11_i1, 4251 &cvtt::Tables::BC7SC::g_mode0_p11_i2, 4252 &cvtt::Tables::BC7SC::g_mode0_p11_i3, 4253 }; 4254 scTables = tables; 4255 numSCTables = sizeof(tables) / sizeof(tables[0]); 4256 } 4257 break; 4258 case 1: 4259 { 4260 const cvtt::Tables::BC7SC::Table *tables[] = 4261 { 4262 &cvtt::Tables::BC7SC::g_mode1_p0_i1, 4263 &cvtt::Tables::BC7SC::g_mode1_p0_i2, 4264 &cvtt::Tables::BC7SC::g_mode1_p0_i3, 4265 &cvtt::Tables::BC7SC::g_mode1_p1_i1, 4266 &cvtt::Tables::BC7SC::g_mode1_p1_i2, 4267 &cvtt::Tables::BC7SC::g_mode1_p1_i3, 4268 }; 4269 scTables = tables; 4270 numSCTables = sizeof(tables) / sizeof(tables[0]); 4271 } 4272 break; 4273 case 2: 4274 { 4275 const cvtt::Tables::BC7SC::Table *tables[] = 4276 { 4277 &cvtt::Tables::BC7SC::g_mode2, 4278 }; 4279 scTables = tables; 
4280 numSCTables = sizeof(tables) / sizeof(tables[0]); 4281 } 4282 break; 4283 case 3: 4284 { 4285 const cvtt::Tables::BC7SC::Table *tables[] = 4286 { 4287 &cvtt::Tables::BC7SC::g_mode3_p0, 4288 &cvtt::Tables::BC7SC::g_mode3_p1, 4289 }; 4290 scTables = tables; 4291 numSCTables = sizeof(tables) / sizeof(tables[0]); 4292 } 4293 break; 4294 case 6: 4295 { 4296 const cvtt::Tables::BC7SC::Table *tables[] = 4297 { 4298 &cvtt::Tables::BC7SC::g_mode6_p0_i1, 4299 &cvtt::Tables::BC7SC::g_mode6_p0_i2, 4300 &cvtt::Tables::BC7SC::g_mode6_p0_i3, 4301 &cvtt::Tables::BC7SC::g_mode6_p0_i4, 4302 &cvtt::Tables::BC7SC::g_mode6_p0_i5, 4303 &cvtt::Tables::BC7SC::g_mode6_p0_i6, 4304 &cvtt::Tables::BC7SC::g_mode6_p0_i7, 4305 &cvtt::Tables::BC7SC::g_mode6_p1_i1, 4306 &cvtt::Tables::BC7SC::g_mode6_p1_i2, 4307 &cvtt::Tables::BC7SC::g_mode6_p1_i3, 4308 &cvtt::Tables::BC7SC::g_mode6_p1_i4, 4309 &cvtt::Tables::BC7SC::g_mode6_p1_i5, 4310 &cvtt::Tables::BC7SC::g_mode6_p1_i6, 4311 &cvtt::Tables::BC7SC::g_mode6_p1_i7, 4312 }; 4313 scTables = tables; 4314 numSCTables = sizeof(tables) / sizeof(tables[0]); 4315 } 4316 break; 4317 case 7: 4318 { 4319 const cvtt::Tables::BC7SC::Table *tables[] = 4320 { 4321 &cvtt::Tables::BC7SC::g_mode7_p00, 4322 &cvtt::Tables::BC7SC::g_mode7_p01, 4323 &cvtt::Tables::BC7SC::g_mode7_p10, 4324 &cvtt::Tables::BC7SC::g_mode7_p11, 4325 }; 4326 scTables = tables; 4327 numSCTables = sizeof(tables) / sizeof(tables[0]); 4328 } 4329 break; 4330 default: 4331 assert(false); 4332 break; 4333 } 4334 4335 TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn); 4336 } 4337 } // shapeIter 4338 4339 for (uint16_t partition = 0; partition < numPartitions; partition++) 4340 { 4341 const int *partitionShapes; 4342 if (numSubsets == 1) 4343 partitionShapes = BC7Data::g_shapes1[partition]; 4344 else if (numSubsets == 2) 
4345 partitionShapes = BC7Data::g_shapes2[partition]; 4346 else 4347 { 4348 assert(numSubsets == 3); 4349 partitionShapes = BC7Data::g_shapes3[partition]; 4350 } 4351 4352 MFloat totalError = ParallelMath::MakeFloatZero(); 4353 for (int subset = 0; subset < numSubsets; subset++) 4354 totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]]; 4355 4356 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error); 4357 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); 4358 4359 if (ParallelMath::AnySet(errorBetter16)) 4360 { 4361 for (int subset = 0; subset < numSubsets; subset++) 4362 { 4363 int shape = partitionShapes[subset]; 4364 int shapeStart = BC7Data::g_shapeRanges[shape][0]; 4365 int shapeLength = BC7Data::g_shapeRanges[shape][1]; 4366 int shapeCollapsedEvalIndex = shapeCollapseList[shape]; 4367 4368 for (int epi = 0; epi < 2; epi++) 4369 for (int ch = 0; ch < 4; ch++) 4370 ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]); 4371 4372 for (int pxi = 0; pxi < shapeLength; pxi++) 4373 { 4374 int px = BC7Data::g_fragments[shapeStart + pxi]; 4375 ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]); 4376 } 4377 } 4378 4379 work.m_error = ParallelMath::Min(totalError, work.m_error); 4380 ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode)); 4381 ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition)); 4382 } 4383 } 4384 } 4385 } 4386 TryDualPlane(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const float channelWeights[4],int numTweakRounds,int numRefineRounds,WorkInfo & work,const ParallelMath::RoundTowardNearestForScope * rtn)4387 static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat 
floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn) 4388 { 4389 // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that. 4390 // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to 4391 // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases: 4392 // - Separate alpha channel, then weighted RGB 4393 // - Alpha+2 other channels, then the independent channel 4394 4395 if (!(flags & Flags::BC7_EnableDualPlane)) 4396 return; 4397 4398 if (numRefineRounds < 1) 4399 numRefineRounds = 1; 4400 4401 if (numTweakRounds < 1) 4402 numTweakRounds = 1; 4403 else if (numTweakRounds > MaxTweakRounds) 4404 numTweakRounds = MaxTweakRounds; 4405 4406 float channelWeightsSq[4]; 4407 for (int ch = 0; ch < 4; ch++) 4408 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; 4409 4410 for (uint16_t mode = 4; mode <= 5; mode++) 4411 { 4412 for (uint16_t rotation = 0; rotation < 4; rotation++) 4413 { 4414 int alphaChannel = (rotation + 3) & 3; 4415 int redChannel = (rotation == 1) ? 3 : 0; 4416 int greenChannel = (rotation == 2) ? 3 : 1; 4417 int blueChannel = (rotation == 3) ? 3 : 2; 4418 4419 MUInt15 rotatedRGB[16][3]; 4420 MFloat floatRotatedRGB[16][3]; 4421 4422 for (int px = 0; px < 16; px++) 4423 { 4424 rotatedRGB[px][0] = pixels[px][redChannel]; 4425 rotatedRGB[px][1] = pixels[px][greenChannel]; 4426 rotatedRGB[px][2] = pixels[px][blueChannel]; 4427 4428 for (int ch = 0; ch < 3; ch++) 4429 floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]); 4430 } 4431 4432 uint16_t maxIndexSelector = (mode == 4) ? 
2 : 1; 4433 4434 float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] }; 4435 float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] }; 4436 float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] }; 4437 float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] }; 4438 4439 float uniformWeight[1] = { 1.0f }; // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error 4440 4441 MFloat preWeightedRotatedRGB[16][3]; 4442 BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights); 4443 4444 for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++) 4445 { 4446 EndpointSelector<3, 8> rgbSelector; 4447 4448 for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++) 4449 { 4450 for (int px = 0; px < 16; px++) 4451 rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f)); 4452 4453 rgbSelector.FinishPass(epPass); 4454 } 4455 4456 MUInt15 alphaRange[2]; 4457 4458 alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel]; 4459 for (int px = 1; px < 16; px++) 4460 { 4461 alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]); 4462 alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]); 4463 } 4464 4465 int rgbPrec = 0; 4466 int alphaPrec = 0; 4467 4468 if (mode == 4) 4469 { 4470 rgbPrec = indexSelector ? 3 : 2; 4471 alphaPrec = indexSelector ? 
2 : 3; 4472 } 4473 else 4474 rgbPrec = alphaPrec = 2; 4475 4476 UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights); 4477 4478 MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX); 4479 MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX); 4480 4481 MUInt15 bestRGBIndexes[16]; 4482 MUInt15 bestAlphaIndexes[16]; 4483 MUInt15 bestEP[2][4]; 4484 4485 for (int px = 0; px < 16; px++) 4486 bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0); 4487 4488 for (int tweak = 0; tweak < numTweakRounds; tweak++) 4489 { 4490 MUInt15 rgbEP[2][3]; 4491 MUInt15 alphaEP[2]; 4492 4493 unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]); 4494 4495 TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP); 4496 4497 for (int refine = 0; refine < numRefineRounds; refine++) 4498 { 4499 if (mode == 4) 4500 CompressEndpoints4(rgbEP, alphaEP, rtn); 4501 else 4502 CompressEndpoints5(rgbEP, alphaEP, rtn); 4503 4504 4505 IndexSelector<1> alphaIndexSelector; 4506 IndexSelector<3> rgbIndexSelector; 4507 4508 { 4509 MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } }; 4510 alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec); 4511 } 4512 rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec); 4513 4514 EndpointRefiner<3> rgbRefiner; 4515 EndpointRefiner<1> alphaRefiner; 4516 4517 rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights); 4518 alphaRefiner.Init(1 << alphaPrec, uniformWeight); 4519 4520 MFloat errorRGB = ParallelMath::MakeFloatZero(); 4521 MFloat errorA = ParallelMath::MakeFloatZero(); 4522 4523 MUInt15 rgbIndexes[16]; 4524 MUInt15 alphaIndexes[16]; 4525 4526 AggregatedError<3> rgbAggError; 4527 AggregatedError<1> alphaAggError; 4528 4529 for (int px = 0; px < 16; px++) 4530 { 4531 MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn); 4532 MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn); 4533 4534 MUInt15 
reconstructedRGB[3]; 4535 MUInt15 reconstructedAlpha[1]; 4536 4537 rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB); 4538 alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha); 4539 4540 if (flags & cvtt::Flags::BC7_FastIndexing) 4541 { 4542 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError); 4543 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError); 4544 } 4545 else 4546 { 4547 AggregatedError<3> baseRGBAggError; 4548 AggregatedError<1> baseAlphaAggError; 4549 4550 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError); 4551 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError); 4552 4553 MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq); 4554 MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq); 4555 4556 MUInt15 altRGBIndexes[2]; 4557 MUInt15 altAlphaIndexes[2]; 4558 4559 altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1); 4560 altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1))); 4561 4562 altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1); 4563 altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1))); 4564 4565 for (int ii = 0; ii < 2; ii++) 4566 { 4567 rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB); 4568 alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha); 4569 4570 AggregatedError<3> altRGBAggError; 4571 AggregatedError<1> altAlphaAggError; 4572 4573 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError); 4574 BCCommon::ComputeErrorLDR<1>(flags, 
reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError); 4575 4576 MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq); 4577 MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq); 4578 4579 ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError)); 4580 ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError)); 4581 4582 rgbError = ParallelMath::Min(altRGBError, rgbError); 4583 alphaError = ParallelMath::Min(altAlphaError, alphaError); 4584 4585 ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]); 4586 ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]); 4587 } 4588 4589 errorRGB = errorRGB + rgbError; 4590 errorA = errorA + alphaError; 4591 } 4592 4593 if (refine != numRefineRounds - 1) 4594 { 4595 rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex); 4596 alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex); 4597 } 4598 4599 if (flags & Flags::BC7_FastIndexing) 4600 { 4601 errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq); 4602 errorA = rgbAggError.Finalize(flags, rotatedAlphaWeightSq); 4603 } 4604 4605 rgbIndexes[px] = rgbIndex; 4606 alphaIndexes[px] = alphaIndex; 4607 } 4608 4609 ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError); 4610 ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError); 4611 4612 ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter); 4613 ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter); 4614 4615 if (ParallelMath::AnySet(rgbBetterInt16)) 4616 { 4617 bestRGBError = ParallelMath::Min(errorRGB, bestRGBError); 4618 4619 for (int px = 0; px < 16; px++) 4620 ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]); 4621 4622 
for (int ep = 0; ep < 2; ep++) 4623 { 4624 for (int ch = 0; ch < 3; ch++) 4625 ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]); 4626 } 4627 } 4628 4629 if (ParallelMath::AnySet(alphaBetterInt16)) 4630 { 4631 bestAlphaError = ParallelMath::Min(errorA, bestAlphaError); 4632 4633 for (int px = 0; px < 16; px++) 4634 ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]); 4635 4636 for (int ep = 0; ep < 2; ep++) 4637 ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]); 4638 } 4639 4640 if (refine != numRefineRounds - 1) 4641 { 4642 rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn); 4643 4644 MUInt15 alphaEPTemp[2][1]; 4645 alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn); 4646 4647 for (int i = 0; i < 2; i++) 4648 alphaEP[i] = alphaEPTemp[i][0]; 4649 } 4650 } // refine 4651 } // tweak 4652 4653 MFloat combinedError = bestRGBError + bestAlphaError; 4654 4655 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error); 4656 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); 4657 4658 work.m_error = ParallelMath::Min(combinedError, work.m_error); 4659 4660 ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode)); 4661 ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation)); 4662 ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector)); 4663 4664 for (int px = 0; px < 16; px++) 4665 { 4666 ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]); 4667 ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? 
bestRGBIndexes[px] : bestAlphaIndexes[px]); 4668 } 4669 4670 for (int ep = 0; ep < 2; ep++) 4671 for (int ch = 0; ch < 4; ch++) 4672 ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]); 4673 } 4674 } 4675 } 4676 } 4677 4678 template<class T> Swap(T & a,T & b)4679 static void Swap(T& a, T& b) 4680 { 4681 T temp = a; 4682 a = b; 4683 b = temp; 4684 } 4685 Pack(uint32_t flags,const PixelBlockU8 * inputs,uint8_t * packedBlocks,const float channelWeights[4],int numTweakRounds,int numRefineRounds)4686 static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds) 4687 { 4688 MUInt15 pixels[16][4]; 4689 MFloat floatPixels[16][4]; 4690 4691 for (int px = 0; px < 16; px++) 4692 { 4693 for (int ch = 0; ch < 4; ch++) 4694 ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]); 4695 } 4696 4697 for (int px = 0; px < 16; px++) 4698 { 4699 for (int ch = 0; ch < 4; ch++) 4700 floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]); 4701 } 4702 4703 WorkInfo work; 4704 memset(&work, 0, sizeof(work)); 4705 4706 work.m_error = ParallelMath::MakeFloat(FLT_MAX); 4707 4708 { 4709 ParallelMath::RoundTowardNearestForScope rtn; 4710 TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn); 4711 TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn); 4712 } 4713 4714 for (int block = 0; block < ParallelMath::ParallelSize; block++) 4715 { 4716 PackingVector pv; 4717 pv.Init(); 4718 4719 ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block); 4720 ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block); 4721 ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block); 4722 4723 const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode]; 4724 4725 
ParallelMath::ScalarUInt16 indexes[16]; 4726 ParallelMath::ScalarUInt16 indexes2[16]; 4727 ParallelMath::ScalarUInt16 endPoints[3][2][4]; 4728 4729 for (int i = 0; i < 16; i++) 4730 { 4731 indexes[i] = ParallelMath::Extract(work.m_indexes[i], block); 4732 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 4733 indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block); 4734 } 4735 4736 for (int subset = 0; subset < 3; subset++) 4737 { 4738 for (int ep = 0; ep < 2; ep++) 4739 { 4740 for (int ch = 0; ch < 4; ch++) 4741 endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block); 4742 } 4743 } 4744 4745 int fixups[3] = { 0, 0, 0 }; 4746 4747 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 4748 { 4749 bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0); 4750 bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0); 4751 4752 if (flipRGB) 4753 { 4754 uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1; 4755 for (int px = 0; px < 16; px++) 4756 indexes[px] = highIndex - indexes[px]; 4757 } 4758 4759 if (flipAlpha) 4760 { 4761 uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1; 4762 for (int px = 0; px < 16; px++) 4763 indexes2[px] = highIndex - indexes2[px]; 4764 } 4765 4766 if (indexSelector) 4767 Swap(flipRGB, flipAlpha); 4768 4769 if (flipRGB) 4770 { 4771 for (int ch = 0; ch < 3; ch++) 4772 Swap(endPoints[0][0][ch], endPoints[0][1][ch]); 4773 } 4774 if (flipAlpha) 4775 Swap(endPoints[0][0][3], endPoints[0][1][3]); 4776 4777 } 4778 else 4779 { 4780 if (modeInfo.m_numSubsets == 2) 4781 fixups[1] = BC7Data::g_fixupIndexes2[partition]; 4782 else if (modeInfo.m_numSubsets == 3) 4783 { 4784 fixups[1] = BC7Data::g_fixupIndexes3[partition][0]; 4785 fixups[2] = BC7Data::g_fixupIndexes3[partition][1]; 4786 } 4787 4788 bool flip[3] = { false, false, false }; 4789 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4790 flip[subset] = ((indexes[fixups[subset]] & (1 << 
(modeInfo.m_indexBits - 1))) != 0); 4791 4792 if (flip[0] || flip[1] || flip[2]) 4793 { 4794 uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1; 4795 for (int px = 0; px < 16; px++) 4796 { 4797 int subset = 0; 4798 if (modeInfo.m_numSubsets == 2) 4799 subset = (BC7Data::g_partitionMap[partition] >> px) & 1; 4800 else if (modeInfo.m_numSubsets == 3) 4801 subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3; 4802 4803 if (flip[subset]) 4804 indexes[px] = highIndex - indexes[px]; 4805 } 4806 4807 int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3; 4808 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4809 { 4810 if (flip[subset]) 4811 for (int ch = 0; ch < maxCH; ch++) 4812 Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]); 4813 } 4814 } 4815 } 4816 4817 pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1); 4818 4819 if (modeInfo.m_partitionBits) 4820 pv.Pack(partition, modeInfo.m_partitionBits); 4821 4822 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 4823 { 4824 ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block); 4825 pv.Pack(rotation, 2); 4826 } 4827 4828 if (modeInfo.m_hasIndexSelector) 4829 pv.Pack(indexSelector, 1); 4830 4831 // Encode RGB 4832 for (int ch = 0; ch < 3; ch++) 4833 { 4834 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4835 { 4836 for (int ep = 0; ep < 2; ep++) 4837 { 4838 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch]; 4839 epPart >>= (8 - modeInfo.m_rgbBits); 4840 4841 pv.Pack(epPart, modeInfo.m_rgbBits); 4842 } 4843 } 4844 } 4845 4846 // Encode alpha 4847 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 4848 { 4849 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4850 { 4851 for (int ep = 0; ep < 2; ep++) 4852 { 4853 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3]; 4854 epPart >>= (8 - modeInfo.m_alphaBits); 4855 4856 pv.Pack(epPart, modeInfo.m_alphaBits); 4857 } 4858 } 4859 } 
4860 4861 // Encode parity bits 4862 if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset) 4863 { 4864 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4865 { 4866 ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0]; 4867 epPart >>= (7 - modeInfo.m_rgbBits); 4868 epPart &= 1; 4869 4870 pv.Pack(epPart, 1); 4871 } 4872 } 4873 else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint) 4874 { 4875 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4876 { 4877 for (int ep = 0; ep < 2; ep++) 4878 { 4879 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0]; 4880 epPart >>= (7 - modeInfo.m_rgbBits); 4881 epPart &= 1; 4882 4883 pv.Pack(epPart, 1); 4884 } 4885 } 4886 } 4887 4888 // Encode indexes 4889 for (int px = 0; px < 16; px++) 4890 { 4891 int bits = modeInfo.m_indexBits; 4892 if ((px == 0) || (px == fixups[1]) || (px == fixups[2])) 4893 bits--; 4894 4895 pv.Pack(indexes[px], bits); 4896 } 4897 4898 // Encode secondary indexes 4899 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 4900 { 4901 for (int px = 0; px < 16; px++) 4902 { 4903 int bits = modeInfo.m_alphaIndexBits; 4904 if (px == 0) 4905 bits--; 4906 4907 pv.Pack(indexes2[px], bits); 4908 } 4909 } 4910 4911 pv.Flush(packedBlocks); 4912 4913 packedBlocks += 16; 4914 } 4915 } 4916 UnpackOne(PixelBlockU8 & output,const uint8_t * packedBlock)4917 static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock) 4918 { 4919 UnpackingVector pv; 4920 pv.Init(packedBlock); 4921 4922 int mode = 8; 4923 for (int i = 0; i < 8; i++) 4924 { 4925 if (pv.Unpack(1) == 1) 4926 { 4927 mode = i; 4928 break; 4929 } 4930 } 4931 4932 if (mode > 7) 4933 { 4934 for (int px = 0; px < 16; px++) 4935 for (int ch = 0; ch < 4; ch++) 4936 output.m_pixels[px][ch] = 0; 4937 4938 return; 4939 } 4940 4941 const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode]; 4942 4943 int partition = 0; 4944 if (modeInfo.m_partitionBits) 4945 partition = pv.Unpack(modeInfo.m_partitionBits); 
4946 4947 int rotation = 0; 4948 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 4949 rotation = pv.Unpack(2); 4950 4951 int indexSelector = 0; 4952 if (modeInfo.m_hasIndexSelector) 4953 indexSelector = pv.Unpack(1); 4954 4955 // Resolve fixups 4956 int fixups[3] = { 0, 0, 0 }; 4957 4958 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate) 4959 { 4960 if (modeInfo.m_numSubsets == 2) 4961 fixups[1] = BC7Data::g_fixupIndexes2[partition]; 4962 else if (modeInfo.m_numSubsets == 3) 4963 { 4964 fixups[1] = BC7Data::g_fixupIndexes3[partition][0]; 4965 fixups[2] = BC7Data::g_fixupIndexes3[partition][1]; 4966 } 4967 } 4968 4969 int endPoints[3][2][4]; 4970 4971 // Decode RGB 4972 for (int ch = 0; ch < 3; ch++) 4973 { 4974 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4975 { 4976 for (int ep = 0; ep < 2; ep++) 4977 endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits)); 4978 } 4979 } 4980 4981 // Decode alpha 4982 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 4983 { 4984 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4985 { 4986 for (int ep = 0; ep < 2; ep++) 4987 endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits)); 4988 } 4989 } 4990 else 4991 { 4992 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 4993 { 4994 for (int ep = 0; ep < 2; ep++) 4995 endPoints[subset][ep][3] = 255; 4996 } 4997 } 4998 4999 int parityBits = 0; 5000 5001 // Decode parity bits 5002 if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset) 5003 { 5004 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 5005 { 5006 int p = pv.Unpack(1); 5007 5008 for (int ep = 0; ep < 2; ep++) 5009 { 5010 for (int ch = 0; ch < 3; ch++) 5011 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits); 5012 5013 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 5014 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits); 5015 } 5016 } 5017 5018 parityBits = 1; 5019 } 
5020 else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint) 5021 { 5022 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 5023 { 5024 for (int ep = 0; ep < 2; ep++) 5025 { 5026 int p = pv.Unpack(1); 5027 5028 for (int ch = 0; ch < 3; ch++) 5029 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits); 5030 5031 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 5032 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits); 5033 } 5034 } 5035 5036 parityBits = 1; 5037 } 5038 5039 // Fill endpoint bits 5040 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) 5041 { 5042 for (int ep = 0; ep < 2; ep++) 5043 { 5044 for (int ch = 0; ch < 3; ch++) 5045 endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits)); 5046 5047 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 5048 endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits)); 5049 } 5050 } 5051 5052 int indexes[16]; 5053 int indexes2[16]; 5054 5055 // Decode indexes 5056 for (int px = 0; px < 16; px++) 5057 { 5058 int bits = modeInfo.m_indexBits; 5059 if ((px == 0) || (px == fixups[1]) || (px == fixups[2])) 5060 bits--; 5061 5062 indexes[px] = pv.Unpack(bits); 5063 } 5064 5065 // Decode secondary indexes 5066 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 5067 { 5068 for (int px = 0; px < 16; px++) 5069 { 5070 int bits = modeInfo.m_alphaIndexBits; 5071 if (px == 0) 5072 bits--; 5073 5074 indexes2[px] = pv.Unpack(bits); 5075 } 5076 } 5077 else 5078 { 5079 for (int px = 0; px < 16; px++) 5080 indexes2[px] = 0; 5081 } 5082 5083 const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits]; 5084 const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits]; 5085 5086 // Decode each pixel 5087 for (int px = 0; px < 16; px++) 5088 { 5089 int rgbWeight = 0; 5090 int alphaWeight = 0; 5091 5092 int rgbIndex = indexes[px]; 5093 5094 rgbWeight = rgbWeights[indexes[px]]; 5095 5096 if 
(modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) 5097 alphaWeight = rgbWeight; 5098 else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate) 5099 alphaWeight = alphaWeights[indexes2[px]]; 5100 5101 if (indexSelector == 1) 5102 { 5103 int temp = rgbWeight; 5104 rgbWeight = alphaWeight; 5105 alphaWeight = temp; 5106 } 5107 5108 int pixel[4] = { 0, 0, 0, 255 }; 5109 5110 int subset = 0; 5111 5112 if (modeInfo.m_numSubsets == 2) 5113 subset = (BC7Data::g_partitionMap[partition] >> px) & 1; 5114 else if (modeInfo.m_numSubsets == 3) 5115 subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3; 5116 5117 for (int ch = 0; ch < 3; ch++) 5118 pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6; 5119 5120 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None) 5121 pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6; 5122 5123 if (rotation != 0) 5124 { 5125 int ch = rotation - 1; 5126 int temp = pixel[ch]; 5127 pixel[ch] = pixel[3]; 5128 pixel[3] = temp; 5129 } 5130 5131 for (int ch = 0; ch < 4; ch++) 5132 output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]); 5133 } 5134 } 5135 }; 5136 5137 class BC6HComputer 5138 { 5139 public: 5140 typedef ParallelMath::Float MFloat; 5141 typedef ParallelMath::SInt16 MSInt16; 5142 typedef ParallelMath::UInt16 MUInt16; 5143 typedef ParallelMath::UInt15 MUInt15; 5144 typedef ParallelMath::AInt16 MAInt16; 5145 typedef ParallelMath::SInt32 MSInt32; 5146 typedef ParallelMath::UInt31 MUInt31; 5147 5148 static const int MaxTweakRounds = 4; 5149 static const int MaxRefineRounds = 3; 5150 QuantizeSingleEndpointElementSigned(const MSInt16 & elem2CL,int precision,const ParallelMath::RoundUpForScope * ru)5151 static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru) 5152 { 5153 assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, 
ParallelMath::MakeSInt16(31744)))); 5154 assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL))); 5155 5156 // Expand to full range 5157 ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0)); 5158 MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL)); 5159 5160 absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision); 5161 5162 MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem); 5163 5164 return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16); 5165 } 5166 QuantizeSingleEndpointElementUnsigned(const MUInt15 & elem,int precision,const ParallelMath::RoundUpForScope * ru)5167 static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru) 5168 { 5169 MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru); 5170 return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision)); 5171 } 5172 UnquantizeSingleEndpointElementSigned(const MSInt16 & comp,int precision,MSInt16 & outUnquantized,MSInt16 & outUnquantizedFinished2CL)5173 static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL) 5174 { 5175 MSInt16 zero = ParallelMath::MakeSInt16(0); 5176 5177 ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero); 5178 MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp)); 5179 5180 MSInt16 unq; 5181 MUInt15 absUnq; 5182 5183 if (precision >= 16) 5184 { 5185 unq = comp; 5186 absUnq = absComp; 5187 } 5188 
else 5189 { 5190 MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2)); 5191 ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero); 5192 ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp); 5193 5194 absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1))); 5195 ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0)); 5196 ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff)); 5197 5198 unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq)); 5199 } 5200 5201 outUnquantized = unq; 5202 5203 MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5)); 5204 5205 outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq)); 5206 } 5207 UnquantizeSingleEndpointElementUnsigned(const MUInt15 & comp,int precision,MUInt16 & outUnquantized,MUInt16 & outUnquantizedFinished)5208 static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished) 5209 { 5210 MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp); 5211 if (precision < 15) 5212 { 5213 MUInt15 zero = ParallelMath::MakeUInt15(0); 5214 MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2)); 5215 5216 ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero); 5217 ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp); 5218 5219 unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision)); 5220 5221 ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0)); 5222 ParallelMath::ConditionalSet(unq, isMax, 
ParallelMath::MakeUInt16(0xffff)); 5223 } 5224 5225 outUnquantized = unq; 5226 outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6)); 5227 } 5228 QuantizeEndpointsSigned(const MSInt16 endPoints[2][3],const MFloat floatPixelsColorSpace[16][3],const MFloat floatPixelsLinearWeighted[16][3],MAInt16 quantizedEndPoints[2][3],MUInt15 indexes[16],IndexSelectorHDR<3> & indexSelector,int fixupIndex,int precision,int indexRange,const float * channelWeights,bool fastIndexing,const ParallelMath::RoundTowardNearestForScope * rtn)5229 static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn) 5230 { 5231 MSInt16 unquantizedEP[2][3]; 5232 MSInt16 finishedUnquantizedEP[2][3]; 5233 5234 { 5235 ParallelMath::RoundUpForScope ru; 5236 5237 for (int epi = 0; epi < 2; epi++) 5238 { 5239 for (int ch = 0; ch < 3; ch++) 5240 { 5241 MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru); 5242 UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]); 5243 quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee); 5244 } 5245 } 5246 } 5247 5248 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange); 5249 indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights); 5250 5251 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1); 5252 5253 MUInt15 index = fastIndexing ? 
indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn); 5254 5255 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index); 5256 5257 if (ParallelMath::AnySet(invert)) 5258 { 5259 ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index)); 5260 5261 indexSelector.ConditionalInvert(invert); 5262 5263 for (int ch = 0; ch < 3; ch++) 5264 { 5265 MAInt16 firstEP = quantizedEndPoints[0][ch]; 5266 MAInt16 secondEP = quantizedEndPoints[1][ch]; 5267 5268 quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP); 5269 quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP); 5270 } 5271 } 5272 5273 indexes[fixupIndex] = index; 5274 } 5275 QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3],const MFloat floatPixelsColorSpace[16][3],const MFloat floatPixelsLinearWeighted[16][3],MAInt16 quantizedEndPoints[2][3],MUInt15 indexes[16],IndexSelectorHDR<3> & indexSelector,int fixupIndex,int precision,int indexRange,const float * channelWeights,bool fastIndexing,const ParallelMath::RoundTowardNearestForScope * rtn)5276 static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn) 5277 { 5278 MUInt16 unquantizedEP[2][3]; 5279 MUInt16 finishedUnquantizedEP[2][3]; 5280 5281 { 5282 ParallelMath::RoundUpForScope ru; 5283 5284 for (int epi = 0; epi < 2; epi++) 5285 { 5286 for (int ch = 0; ch < 3; ch++) 5287 { 5288 MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), 
precision, &ru); 5289 UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]); 5290 quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee); 5291 } 5292 } 5293 } 5294 5295 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange); 5296 indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights); 5297 5298 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1); 5299 5300 MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn); 5301 5302 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index); 5303 5304 if (ParallelMath::AnySet(invert)) 5305 { 5306 ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index)); 5307 5308 indexSelector.ConditionalInvert(invert); 5309 5310 for (int ch = 0; ch < 3; ch++) 5311 { 5312 MAInt16 firstEP = quantizedEndPoints[0][ch]; 5313 MAInt16 secondEP = quantizedEndPoints[1][ch]; 5314 5315 quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP); 5316 quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP); 5317 } 5318 } 5319 5320 indexes[fixupIndex] = index; 5321 } 5322 EvaluatePartitionedLegality(const MAInt16 ep0[2][3],const MAInt16 ep1[2][3],int aPrec,const int bPrec[3],bool isTransformed,MAInt16 outEncodedEPs[2][2][3],ParallelMath::Int16CompFlag & outIsLegal)5323 static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal) 5324 { 5325 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true); 5326 5327 MAInt16 aSignificantMask = 
ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1)); 5328 5329 for (int ch = 0; ch < 3; ch++) 5330 { 5331 outEncodedEPs[0][0][ch] = ep0[0][ch]; 5332 outEncodedEPs[0][1][ch] = ep0[1][ch]; 5333 outEncodedEPs[1][0][ch] = ep1[0][ch]; 5334 outEncodedEPs[1][1][ch] = ep1[1][ch]; 5335 5336 if (isTransformed) 5337 { 5338 for (int subset = 0; subset < 2; subset++) 5339 { 5340 for (int epi = 0; epi < 2; epi++) 5341 { 5342 if (epi == 0 && subset == 0) 5343 continue; 5344 5345 MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask); 5346 5347 MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]); 5348 5349 outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta); 5350 5351 MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask); 5352 allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced); 5353 } 5354 } 5355 } 5356 5357 if (!ParallelMath::AnySet(allLegal)) 5358 break; 5359 } 5360 5361 outIsLegal = allLegal; 5362 } 5363 EvaluateSingleLegality(const MAInt16 ep[2][3],int aPrec,const int bPrec[3],bool isTransformed,MAInt16 outEncodedEPs[2][3],ParallelMath::Int16CompFlag & outIsLegal)5364 static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal) 5365 { 5366 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true); 5367 5368 MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1)); 5369 5370 for (int ch = 0; ch < 3; ch++) 5371 { 5372 outEncodedEPs[0][ch] = ep[0][ch]; 5373 outEncodedEPs[1][ch] = ep[1][ch]; 5374 5375 if (isTransformed) 5376 { 5377 MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask); 5378 5379 MSInt16 delta = 
ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]); 5380 5381 outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta); 5382 5383 MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask); 5384 allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced); 5385 } 5386 } 5387 5388 outIsLegal = allLegal; 5389 } 5390 Pack(uint32_t flags,const PixelBlockF16 * inputs,uint8_t * packedBlocks,const float channelWeights[4],bool isSigned,int numTweakRounds,int numRefineRounds)5391 static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds) 5392 { 5393 if (numTweakRounds < 1) 5394 numTweakRounds = 1; 5395 else if (numTweakRounds > MaxTweakRounds) 5396 numTweakRounds = MaxTweakRounds; 5397 5398 if (numRefineRounds < 1) 5399 numRefineRounds = 1; 5400 else if (numRefineRounds > MaxRefineRounds) 5401 numRefineRounds = MaxRefineRounds; 5402 5403 bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0); 5404 float channelWeightsSq[3]; 5405 5406 ParallelMath::RoundTowardNearestForScope rtn; 5407 5408 MSInt16 pixels[16][3]; 5409 MFloat floatPixels2CL[16][3]; 5410 MFloat floatPixelsLinearWeighted[16][3]; 5411 5412 MSInt16 low15Bits = ParallelMath::MakeSInt16(32767); 5413 5414 for (int ch = 0; ch < 3; ch++) 5415 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; 5416 5417 for (int px = 0; px < 16; px++) 5418 { 5419 for (int ch = 0; ch < 3; ch++) 5420 { 5421 MSInt16 pixelValue; 5422 ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue); 5423 5424 // Convert from sign+magnitude to 2CL 5425 if (isSigned) 5426 { 5427 ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0)); 5428 MSInt16 magnitude = (pixelValue & 
low15Bits); 5429 ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude); 5430 pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743)); 5431 } 5432 else 5433 pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0)); 5434 5435 pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743)); 5436 5437 pixels[px][ch] = pixelValue; 5438 floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue); 5439 floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch]; 5440 } 5441 } 5442 5443 MFloat preWeightedPixels[16][3]; 5444 5445 BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights); 5446 5447 MAInt16 bestEndPoints[2][2][3]; 5448 MUInt15 bestIndexes[16]; 5449 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); 5450 MUInt15 bestMode = ParallelMath::MakeUInt15(0); 5451 MUInt15 bestPartition = ParallelMath::MakeUInt15(0); 5452 5453 for (int px = 0; px < 16; px++) 5454 bestIndexes[px] = ParallelMath::MakeUInt15(0); 5455 5456 for (int subset = 0; subset < 2; subset++) 5457 for (int epi = 0; epi < 2; epi++) 5458 for (int ch = 0; ch < 3; ch++) 5459 bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0); 5460 5461 UnfinishedEndpoints<3> partitionedUFEP[32][2]; 5462 UnfinishedEndpoints<3> singleUFEP; 5463 5464 // Generate UFEP for partitions 5465 for (int p = 0; p < 32; p++) 5466 { 5467 int partitionMask = BC7Data::g_partitionMap[p]; 5468 5469 EndpointSelector<3, 8> epSelectors[2]; 5470 5471 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) 5472 { 5473 for (int px = 0; px < 16; px++) 5474 { 5475 int subset = (partitionMask >> px) & 1; 5476 epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f)); 5477 } 5478 5479 for (int subset = 0; subset < 2; subset++) 5480 epSelectors[subset].FinishPass(pass); 5481 } 5482 5483 for (int subset = 0; subset < 2; subset++) 5484 
partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights); 5485 } 5486 5487 // Generate UFEP for single 5488 { 5489 EndpointSelector<3, 8> epSelector; 5490 5491 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) 5492 { 5493 for (int px = 0; px < 16; px++) 5494 epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f)); 5495 5496 epSelector.FinishPass(pass); 5497 } 5498 5499 singleUFEP = epSelector.GetEndpoints(channelWeights); 5500 } 5501 5502 for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++) 5503 { 5504 bool partitioned = (partitionedInt == 1); 5505 5506 for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--) 5507 { 5508 if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec]) 5509 continue; 5510 5511 int numPartitions = partitioned ? 32 : 1; 5512 int numSubsets = partitioned ? 2 : 1; 5513 int indexBits = partitioned ? 3 : 4; 5514 int indexRange = (1 << indexBits); 5515 5516 for (int p = 0; p < numPartitions; p++) 5517 { 5518 int partitionMask = partitioned ? 
BC7Data::g_partitionMap[p] : 0; 5519 5520 const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds; 5521 5522 MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3]; 5523 MUInt15 metaIndexes[MaxMetaRounds][16]; 5524 MFloat metaError[MaxMetaRounds][2]; 5525 5526 bool roundValid[MaxMetaRounds][2]; 5527 5528 for (int r = 0; r < MaxMetaRounds; r++) 5529 for (int subset = 0; subset < 2; subset++) 5530 roundValid[r][subset] = true; 5531 5532 for (int subset = 0; subset < numSubsets; subset++) 5533 { 5534 for (int tweak = 0; tweak < MaxTweakRounds; tweak++) 5535 { 5536 EndpointRefiner<3> refiners[2]; 5537 5538 bool abortRemainingRefines = false; 5539 for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++) 5540 { 5541 int metaRound = tweak * MaxRefineRounds + refinePass; 5542 5543 if (tweak >= numTweakRounds || refinePass >= numRefineRounds) 5544 abortRemainingRefines = true; 5545 5546 if (abortRemainingRefines) 5547 { 5548 roundValid[metaRound][subset] = false; 5549 continue; 5550 } 5551 5552 MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound]; 5553 MUInt15(&mrIndexes)[16] = metaIndexes[metaRound]; 5554 5555 MSInt16 endPointsColorSpace[2][3]; 5556 5557 if (refinePass == 0) 5558 { 5559 UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP; 5560 5561 if (isSigned) 5562 ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn); 5563 else 5564 ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn); 5565 } 5566 else 5567 refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn); 5568 5569 refiners[subset].Init(indexRange, channelWeights); 5570 5571 int fixupIndex = (subset == 0) ? 
0 : BC7Data::g_fixupIndexes2[p]; 5572 5573 IndexSelectorHDR<3> indexSelector; 5574 if (isSigned) 5575 QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn); 5576 else 5577 QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn); 5578 5579 if (metaRound > 0) 5580 { 5581 ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false); 5582 5583 for (int prevRound = 0; prevRound < metaRound; prevRound++) 5584 { 5585 MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset]; 5586 5587 ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true); 5588 5589 for (int epi = 0; epi < 2; epi++) 5590 for (int ch = 0; ch < 3; ch++) 5591 same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch])); 5592 5593 anySame = (anySame | same); 5594 if (ParallelMath::AllSet(anySame)) 5595 break; 5596 } 5597 5598 if (ParallelMath::AllSet(anySame)) 5599 { 5600 roundValid[metaRound][subset] = false; 5601 continue; 5602 } 5603 } 5604 5605 MFloat subsetError = ParallelMath::MakeFloatZero(); 5606 5607 { 5608 for (int px = 0; px < 16; px++) 5609 { 5610 if (subset != ((partitionMask >> px) & 1)) 5611 continue; 5612 5613 MUInt15 index; 5614 if (px == fixupIndex) 5615 index = mrIndexes[px]; 5616 else 5617 { 5618 index = fastIndexing ? 
indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn); 5619 mrIndexes[px] = index; 5620 } 5621 5622 MSInt16 reconstructed[3]; 5623 if (isSigned) 5624 indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed); 5625 else 5626 indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed); 5627 5628 subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq)); 5629 5630 if (refinePass != numRefineRounds - 1) 5631 refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index); 5632 } 5633 } 5634 5635 metaError[metaRound][subset] = subsetError; 5636 } 5637 } 5638 } 5639 5640 // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme 5641 int numMeta1 = partitioned ? MaxMetaRounds : 1; 5642 for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++) 5643 { 5644 if (!roundValid[meta0][0]) 5645 continue; 5646 5647 for (int meta1 = 0; meta1 < numMeta1; meta1++) 5648 { 5649 MFloat combinedError = metaError[meta0][0]; 5650 if (partitioned) 5651 { 5652 if (!roundValid[meta1][1]) 5653 continue; 5654 5655 combinedError = combinedError + metaError[meta1][1]; 5656 } 5657 5658 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError); 5659 if (!ParallelMath::AnySet(errorBetter)) 5660 continue; 5661 5662 ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter); 5663 5664 // Figure out if this is encodable 5665 for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++) 5666 { 5667 const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode]; 5668 5669 if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec) 5670 continue; 5671 5672 MAInt16 encodedEPs[2][2][3]; 5673 ParallelMath::Int16CompFlag isLegal; 5674 if (partitioned) 5675 
EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal); 5676 else 5677 EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal); 5678 5679 ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal); 5680 if (!ParallelMath::AnySet(isLegalAndBetter)) 5681 continue; 5682 5683 ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter); 5684 5685 ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError); 5686 ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode))); 5687 ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p))); 5688 5689 for (int subset = 0; subset < numSubsets; subset++) 5690 { 5691 for (int epi = 0; epi < 2; epi++) 5692 { 5693 for (int ch = 0; ch < 3; ch++) 5694 ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]); 5695 } 5696 } 5697 5698 for (int px = 0; px < 16; px++) 5699 { 5700 int subset = ((partitionMask >> px) & 1); 5701 if (subset == 0) 5702 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]); 5703 else 5704 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]); 5705 } 5706 5707 needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter); 5708 if (!ParallelMath::AnySet(needsCommit)) 5709 break; 5710 } 5711 } 5712 } 5713 } 5714 } 5715 } 5716 5717 // At this point, everything should be set 5718 for (int block = 0; block < ParallelMath::ParallelSize; block++) 5719 { 5720 ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block); 5721 ParallelMath::ScalarUInt16 partition = 
ParallelMath::Extract(bestPartition, block); 5722 int32_t eps[2][2][3]; 5723 ParallelMath::ScalarUInt16 indexes[16]; 5724 5725 const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode]; 5726 5727 const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode]; 5728 5729 const size_t headerBits = modeInfo.m_partitioned ? 82 : 65; 5730 5731 for (int subset = 0; subset < 2; subset++) 5732 { 5733 for (int epi = 0; epi < 2; epi++) 5734 { 5735 for (int ch = 0; ch < 3; ch++) 5736 eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block); 5737 } 5738 } 5739 5740 for (int px = 0; px < 16; px++) 5741 indexes[px] = ParallelMath::Extract(bestIndexes[px], block); 5742 5743 uint16_t modeID = modeInfo.m_modeID; 5744 5745 PackingVector pv; 5746 pv.Init(); 5747 5748 for (size_t i = 0; i < headerBits; i++) 5749 { 5750 int32_t codedValue = 0; 5751 switch (desc[i].m_eField) 5752 { 5753 case BC6HData::M: codedValue = modeID; break; 5754 case BC6HData::D: codedValue = partition; break; 5755 case BC6HData::RW: codedValue = eps[0][0][0]; break; 5756 case BC6HData::RX: codedValue = eps[0][1][0]; break; 5757 case BC6HData::RY: codedValue = eps[1][0][0]; break; 5758 case BC6HData::RZ: codedValue = eps[1][1][0]; break; 5759 case BC6HData::GW: codedValue = eps[0][0][1]; break; 5760 case BC6HData::GX: codedValue = eps[0][1][1]; break; 5761 case BC6HData::GY: codedValue = eps[1][0][1]; break; 5762 case BC6HData::GZ: codedValue = eps[1][1][1]; break; 5763 case BC6HData::BW: codedValue = eps[0][0][2]; break; 5764 case BC6HData::BX: codedValue = eps[0][1][2]; break; 5765 case BC6HData::BY: codedValue = eps[1][0][2]; break; 5766 case BC6HData::BZ: codedValue = eps[1][1][2]; break; 5767 default: assert(false); break; 5768 } 5769 5770 pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1); 5771 } 5772 5773 int fixupIndex1 = 0; 5774 int indexBits = 4; 5775 if (modeInfo.m_partitioned) 5776 { 5777 fixupIndex1 = BC7Data::g_fixupIndexes2[partition]; 
5778 indexBits = 3; 5779 } 5780 5781 for (int px = 0; px < 16; px++) 5782 { 5783 ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block); 5784 if (px == 0 || px == fixupIndex1) 5785 pv.Pack(index, indexBits - 1); 5786 else 5787 pv.Pack(index, indexBits); 5788 } 5789 5790 pv.Flush(packedBlocks + 16 * block); 5791 } 5792 } 5793 SignExtendSingle(int & v,int bits)5794 static void SignExtendSingle(int &v, int bits) 5795 { 5796 if (v & (1 << (bits - 1))) 5797 v |= -(1 << bits); 5798 } 5799 UnpackOne(PixelBlockF16 & output,const uint8_t * pBC,bool isSigned)5800 static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned) 5801 { 5802 UnpackingVector pv; 5803 pv.Init(pBC); 5804 5805 int numModeBits = 2; 5806 int modeBits = pv.Unpack(2); 5807 if (modeBits != 0 && modeBits != 1) 5808 { 5809 modeBits |= pv.Unpack(3) << 2; 5810 numModeBits += 3; 5811 } 5812 5813 int mode = -1; 5814 for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++) 5815 { 5816 if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits) 5817 { 5818 mode = possibleMode; 5819 break; 5820 } 5821 } 5822 5823 if (mode < 0) 5824 { 5825 for (int px = 0; px < 16; px++) 5826 { 5827 for (int ch = 0; ch < 3; ch++) 5828 output.m_pixels[px][ch] = 0; 5829 output.m_pixels[px][3] = 0x3c00; // 1.0 5830 } 5831 return; 5832 } 5833 5834 const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode]; 5835 const size_t headerBits = modeInfo.m_partitioned ? 
82 : 65; 5836 const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode]; 5837 5838 int32_t partition = 0; 5839 int32_t eps[2][2][3]; 5840 5841 for (int subset = 0; subset < 2; subset++) 5842 for (int epi = 0; epi < 2; epi++) 5843 for (int ch = 0; ch < 3; ch++) 5844 eps[subset][epi][ch] = 0; 5845 5846 for (size_t i = numModeBits; i < headerBits; i++) 5847 { 5848 int32_t *pCodedValue = NULL; 5849 5850 switch (desc[i].m_eField) 5851 { 5852 case BC6HData::D: pCodedValue = &partition; break; 5853 case BC6HData::RW: pCodedValue = &eps[0][0][0]; break; 5854 case BC6HData::RX: pCodedValue = &eps[0][1][0]; break; 5855 case BC6HData::RY: pCodedValue = &eps[1][0][0]; break; 5856 case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break; 5857 case BC6HData::GW: pCodedValue = &eps[0][0][1]; break; 5858 case BC6HData::GX: pCodedValue = &eps[0][1][1]; break; 5859 case BC6HData::GY: pCodedValue = &eps[1][0][1]; break; 5860 case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break; 5861 case BC6HData::BW: pCodedValue = &eps[0][0][2]; break; 5862 case BC6HData::BX: pCodedValue = &eps[0][1][2]; break; 5863 case BC6HData::BY: pCodedValue = &eps[1][0][2]; break; 5864 case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break; 5865 default: assert(false); break; 5866 } 5867 5868 (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit; 5869 } 5870 5871 5872 uint16_t modeID = modeInfo.m_modeID; 5873 5874 int fixupIndex1 = 0; 5875 int indexBits = 4; 5876 int numSubsets = 1; 5877 if (modeInfo.m_partitioned) 5878 { 5879 fixupIndex1 = BC7Data::g_fixupIndexes2[partition]; 5880 indexBits = 3; 5881 numSubsets = 2; 5882 } 5883 5884 int indexes[16]; 5885 for (int px = 0; px < 16; px++) 5886 { 5887 if (px == 0 || px == fixupIndex1) 5888 indexes[px] = pv.Unpack(indexBits - 1); 5889 else 5890 indexes[px] = pv.Unpack(indexBits); 5891 } 5892 5893 if (modeInfo.m_partitioned) 5894 { 5895 for (int ch = 0; ch < 3; ch++) 5896 { 5897 if (isSigned) 5898 SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec); 5899 if 
(modeInfo.m_transformed || isSigned) 5900 { 5901 SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]); 5902 SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]); 5903 SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]); 5904 } 5905 } 5906 } 5907 else 5908 { 5909 for (int ch = 0; ch < 3; ch++) 5910 { 5911 if (isSigned) 5912 SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec); 5913 if (modeInfo.m_transformed || isSigned) 5914 SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]); 5915 } 5916 } 5917 5918 int aPrec = modeInfo.m_aPrec; 5919 5920 if (modeInfo.m_transformed) 5921 { 5922 for (int ch = 0; ch < 3; ch++) 5923 { 5924 int wrapMask = (1 << aPrec) - 1; 5925 5926 eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask); 5927 if (isSigned) 5928 SignExtendSingle(eps[0][1][ch], aPrec); 5929 5930 if (modeInfo.m_partitioned) 5931 { 5932 eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask); 5933 eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask); 5934 5935 if (isSigned) 5936 { 5937 SignExtendSingle(eps[1][0][ch], aPrec); 5938 SignExtendSingle(eps[1][1][ch], aPrec); 5939 } 5940 } 5941 } 5942 } 5943 5944 // Unquantize endpoints 5945 for (int subset = 0; subset < numSubsets; subset++) 5946 { 5947 for (int epi = 0; epi < 2; epi++) 5948 { 5949 for (int ch = 0; ch < 3; ch++) 5950 { 5951 int &v = eps[subset][epi][ch]; 5952 5953 if (isSigned) 5954 { 5955 if (aPrec >= 16) 5956 { 5957 // Nothing 5958 } 5959 else 5960 { 5961 bool s = false; 5962 int comp = v; 5963 if (v < 0) 5964 { 5965 s = true; 5966 comp = -comp; 5967 } 5968 5969 int unq = 0; 5970 if (comp == 0) 5971 unq = 0; 5972 else if (comp >= ((1 << (aPrec - 1)) - 1)) 5973 unq = 0x7fff; 5974 else 5975 unq = ((comp << 15) + 0x4000) >> (aPrec - 1); 5976 5977 if (s) 5978 unq = -unq; 5979 5980 v = unq; 5981 } 5982 } 5983 else 5984 { 5985 if (aPrec >= 15) 5986 { 5987 // Nothing 5988 } 5989 else if (v == 0) 5990 { 5991 // Nothing 5992 } 5993 else if (v == ((1 << aPrec) - 1)) 5994 v = 0xffff; 5995 else 5996 
v = ((v << 16) + 0x8000) >> aPrec; 5997 } 5998 } 5999 } 6000 } 6001 6002 const int *weights = BC7Data::g_weightTables[indexBits]; 6003 6004 for (int px = 0; px < 16; px++) 6005 { 6006 int subset = 0; 6007 if (modeInfo.m_partitioned) 6008 subset = (BC7Data::g_partitionMap[partition] >> px) & 1; 6009 6010 int w = weights[indexes[px]]; 6011 for (int ch = 0; ch < 3; ch++) 6012 { 6013 int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6; 6014 6015 if (isSigned) 6016 { 6017 if (comp < 0) 6018 comp = -(((-comp) * 31) >> 5); 6019 else 6020 comp = (comp * 31) >> 5; 6021 6022 int s = 0; 6023 if (comp < 0) 6024 { 6025 s = 0x8000; 6026 comp = -comp; 6027 } 6028 6029 output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp); 6030 } 6031 else 6032 { 6033 comp = (comp * 31) >> 6; 6034 output.m_pixels[px][ch] = static_cast<uint16_t>(comp); 6035 } 6036 } 6037 output.m_pixels[px][3] = 0x3c00; // 1.0 6038 } 6039 } 6040 }; 6041 6042 namespace S3TCSingleColorTables 6043 { 6044 struct SingleColorTableEntry 6045 { 6046 uint8_t m_min; 6047 uint8_t m_max; 6048 uint8_t m_actualColor; 6049 uint8_t m_span; 6050 }; 6051 6052 SingleColorTableEntry g_singleColor5_3[256] = 6053 { 6054 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 }, 6055 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 }, 6056 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 }, 6057 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 }, 6058 { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, 6059 { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 
41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, 6060 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 }, 6061 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 }, 6062 { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 }, 6063 { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 }, 6064 { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 }, 6065 { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 }, 6066 { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 }, 6067 { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 }, 6068 { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 }, 6069 { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, 6070 { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 }, 6071 { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 
132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 }, 6072 { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 }, 6073 { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 }, 6074 { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 }, 6075 { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 }, 6076 { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 }, 6077 { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 }, 6078 { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 }, 6079 { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 }, 6080 { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, 6081 { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, 6082 { 231, 222, 225, 9 }, { 231, 222, 225, 
9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 }, 6083 { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 }, 6084 { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6085 { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6086 }; 6087 6088 SingleColorTableEntry g_singleColor6_3[256] = 6089 { 6090 { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 }, 6091 { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 }, 6092 { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 }, 6093 { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 }, 6094 { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 }, 6095 { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 }, 6096 { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 }, 6097 { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 }, 6098 { 56, 
69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 }, 6099 { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 }, 6100 { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 }, 6101 { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 }, 6102 { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 }, 6103 { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 }, 6104 { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 }, 6105 { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 }, 6106 { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 }, 6107 { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 }, 6108 { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 }, 6109 { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 
158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 }, 6110 { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 }, 6111 { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 }, 6112 { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 }, 6113 { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 }, 6114 { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 }, 6115 { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 }, 6116 { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 }, 6117 { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 }, 6118 { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 }, 6119 { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, 6120 { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 
}, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6121 { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6122 }; 6123 6124 SingleColorTableEntry g_singleColor5_2[256] = 6125 { 6126 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, 6127 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, 6128 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, 6129 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 }, 6130 { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 }, 6131 { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 }, 6132 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 }, 6133 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, 6134 { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 }, 6135 { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, 6136 { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 
86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, 6137 { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, 6138 { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 }, 6139 { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 }, 6140 { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 }, 6141 { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, 6142 { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 }, 6143 { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 }, 6144 { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 }, 6145 { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 }, 6146 { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 }, 6147 { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 
173, 173, 0 }, 6148 { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, 6149 { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, 6150 { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 }, 6151 { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 }, 6152 { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, 6153 { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, 6154 { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 }, 6155 { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 }, 6156 { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6157 { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6158 }; 6159 6160 SingleColorTableEntry g_singleColor6_2[256] = 6161 { 6162 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 
4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 }, 6163 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 }, 6164 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 }, 6165 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 }, 6166 { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 }, 6167 { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 }, 6168 { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 }, 6169 { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 }, 6170 { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 }, 6171 { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 }, 6172 { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 }, 6173 { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 }, 6174 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 }, 6175 { 105, 105, 105, 0 }, { 105, 105, 105, 0 
}, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 }, 6176 { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 }, 6177 { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 }, 6178 { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 }, 6179 { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 }, 6180 { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 }, 6181 { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, 6182 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, 6183 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, 6184 { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, 6185 { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, 6186 { 190, 195, 192, 5 }, { 190, 195, 
192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 }, 6187 { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 }, 6188 { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 }, 6189 { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 }, 6190 { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 }, 6191 { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, 6192 { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6193 { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6194 }; 6195 6196 SingleColorTableEntry g_singleColor5_3_p[256] = 6197 { 6198 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 }, 6199 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 }, 6200 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 }, 6201 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 
33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 }, 6202 { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, 6203 { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, 6204 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 }, 6205 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 }, 6206 { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 }, 6207 { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 }, 6208 { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 }, 6209 { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 }, 6210 { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 }, 6211 { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 }, 6212 { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 }, 6213 { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 
0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, 6214 { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 }, 6215 { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 }, 6216 { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 }, 6217 { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 }, 6218 { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 }, 6219 { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 }, 6220 { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 }, 6221 { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 }, 6222 { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 }, 6223 { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 }, 6224 { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 
214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, 6225 { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, 6226 { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 }, 6227 { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 }, 6228 { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6229 { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6230 }; 6231 6232 SingleColorTableEntry g_singleColor6_3_p[256] = 6233 { 6234 { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 }, 6235 { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 }, 6236 { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 }, 6237 { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 }, 6238 { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 }, 6239 { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 }, 6240 { 48, 48, 48, 0 
}, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 }, 6241 { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 }, 6242 { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 }, 6243 { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 }, 6244 { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 }, 6245 { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 }, 6246 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 }, 6247 { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 }, 6248 { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 }, 6249 { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 }, 6250 { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 }, 6251 { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 
143, 4 }, 6252 { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 }, 6253 { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 }, 6254 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 }, 6255 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 }, 6256 { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 }, 6257 { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 }, 6258 { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 }, 6259 { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 }, 6260 { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 }, 6261 { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 }, 6262 { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 
231, 0 }, 6263 { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, 6264 { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6265 { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6266 }; 6267 6268 SingleColorTableEntry g_singleColor5_2_p[256] = 6269 { 6270 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, 6271 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, 6272 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, 6273 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 }, 6274 { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 }, 6275 { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 }, 6276 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 }, 6277 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, 6278 { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 
74, 70, 8 }, 6279 { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, 6280 { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, 6281 { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, 6282 { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 }, 6283 { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 }, 6284 { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 }, 6285 { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, 6286 { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 }, 6287 { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 }, 6288 { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 }, 6289 { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 }, 6290 { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 
156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 }, 6291 { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, 6292 { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, 6293 { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, 6294 { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 }, 6295 { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 }, 6296 { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, 6297 { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, 6298 { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 }, 6299 { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 }, 6300 { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6301 { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 
251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6302 }; 6303 6304 SingleColorTableEntry g_singleColor6_2_p[256] = 6305 { 6306 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 }, 6307 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 }, 6308 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 }, 6309 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 }, 6310 { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 }, 6311 { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 }, 6312 { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 }, 6313 { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 }, 6314 { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 }, 6315 { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 }, 6316 { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 }, 6317 { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 
93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 }, 6318 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 }, 6319 { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 }, 6320 { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 }, 6321 { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 }, 6322 { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 }, 6323 { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 }, 6324 { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, 6325 { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, 6326 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, 6327 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, 6328 { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 
182, 182, 182, 0 }, { 182, 182, 182, 0 }, 6329 { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, 6330 { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 }, 6331 { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 }, 6332 { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 }, 6333 { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 }, 6334 { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 }, 6335 { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, 6336 { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, 6337 { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, 6338 }; 6339 } 6340 6341 class S3TCComputer 6342 { 6343 public: 6344 typedef ParallelMath::Float MFloat; 6345 typedef ParallelMath::SInt16 MSInt16; 6346 typedef ParallelMath::UInt15 MUInt15; 6347 typedef ParallelMath::UInt16 MUInt16; 6348 typedef ParallelMath::SInt32 MSInt32; 6349 Init(MFloat & error)6350 static void 
Init(MFloat& error) 6351 { 6352 error = ParallelMath::MakeFloat(FLT_MAX); 6353 } 6354 QuantizeTo6Bits(MUInt15 & v)6355 static void QuantizeTo6Bits(MUInt15& v) 6356 { 6357 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); 6358 v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); 6359 } 6360 QuantizeTo5Bits(MUInt15 & v)6361 static void QuantizeTo5Bits(MUInt15& v) 6362 { 6363 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); 6364 v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); 6365 } 6366 QuantizeTo565(MUInt15 endPoint[3])6367 static void QuantizeTo565(MUInt15 endPoint[3]) 6368 { 6369 QuantizeTo5Bits(endPoint[0]); 6370 QuantizeTo6Bits(endPoint[1]); 6371 QuantizeTo5Bits(endPoint[2]); 6372 } 6373 ParanoidFactorForSpan(const MSInt16 & span)6374 static MFloat ParanoidFactorForSpan(const MSInt16& span) 6375 { 6376 return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; 6377 } 6378 ParanoidDiff(const MUInt15 & a,const MUInt15 & b,const MFloat & d)6379 static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) 6380 { 6381 MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b))); 6382 absDiff = absDiff + d; 6383 return absDiff * absDiff; 6384 } 6385 TestSingleColor(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],int range,const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 bestIndexes[16],MUInt15 & bestRange,const ParallelMath::RoundTowardNearestForScope * rtn)6386 static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights, 
6387 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn) 6388 { 6389 float channelWeightsSq[3]; 6390 6391 for (int ch = 0; ch < 3; ch++) 6392 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; 6393 6394 MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) }; 6395 6396 for (int px = 0; px < 16; px++) 6397 { 6398 for (int ch = 0; ch < 3; ch++) 6399 totals[ch] = totals[ch] + pixels[px][ch]; 6400 } 6401 6402 MUInt15 average[3]; 6403 for (int ch = 0; ch < 3; ch++) 6404 average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4); 6405 6406 const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL; 6407 const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL; 6408 if (flags & cvtt::Flags::S3TC_Paranoid) 6409 { 6410 if (range == 4) 6411 { 6412 rbTable = S3TCSingleColorTables::g_singleColor5_3_p; 6413 gTable = S3TCSingleColorTables::g_singleColor6_3_p; 6414 } 6415 else 6416 { 6417 assert(range == 3); 6418 rbTable = S3TCSingleColorTables::g_singleColor5_2_p; 6419 gTable = S3TCSingleColorTables::g_singleColor6_2_p; 6420 } 6421 } 6422 else 6423 { 6424 if (range == 4) 6425 { 6426 rbTable = S3TCSingleColorTables::g_singleColor5_3; 6427 gTable = S3TCSingleColorTables::g_singleColor6_3; 6428 } 6429 else 6430 { 6431 assert(range == 3); 6432 rbTable = S3TCSingleColorTables::g_singleColor5_2; 6433 gTable = S3TCSingleColorTables::g_singleColor6_2; 6434 } 6435 } 6436 6437 MUInt15 interpolated[3]; 6438 MUInt15 eps[2][3]; 6439 MSInt16 spans[3]; 6440 for (int i = 0; i < ParallelMath::ParallelSize; i++) 6441 { 6442 for (int ch = 0; ch < 3; ch++) 6443 { 6444 uint16_t avg = ParallelMath::Extract(average[ch], i); 6445 const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? 
gTable[avg] : rbTable[avg]); 6446 ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min); 6447 ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max); 6448 ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor); 6449 ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span); 6450 } 6451 } 6452 6453 MFloat error = ParallelMath::MakeFloatZero(); 6454 if (flags & cvtt::Flags::S3TC_Paranoid) 6455 { 6456 MFloat spanParanoidFactors[3]; 6457 for (int ch = 0; ch < 3; ch++) 6458 spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]); 6459 6460 for (int px = 0; px < 16; px++) 6461 { 6462 for (int ch = 0; ch < 3; ch++) 6463 error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch]; 6464 } 6465 } 6466 else 6467 { 6468 for (int px = 0; px < 16; px++) 6469 { 6470 for (int ch = 0; ch < 3; ch++) 6471 error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch]; 6472 } 6473 } 6474 6475 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); 6476 ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better); 6477 6478 if (ParallelMath::AnySet(better16)) 6479 { 6480 bestError = ParallelMath::Min(bestError, error); 6481 for (int epi = 0; epi < 2; epi++) 6482 for (int ch = 0; ch < 3; ch++) 6483 ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]); 6484 6485 MUInt15 vindexes = ParallelMath::MakeUInt15(1); 6486 for (int px = 0; px < 16; px++) 6487 ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes); 6488 6489 ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range)); 6490 } 6491 } 6492 TestEndpoints(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const MFloat preWeightedPixels[16][4],const MUInt15 unquantizedEndPoints[2][3],int range,const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 
bestIndexes[16],MUInt15 & bestRange,EndpointRefiner<3> * refiner,const ParallelMath::RoundTowardNearestForScope * rtn)6493 static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights, 6494 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn) 6495 { 6496 float channelWeightsSq[3]; 6497 6498 for (int ch = 0; ch < 3; ch++) 6499 channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; 6500 6501 MUInt15 endPoints[2][3]; 6502 6503 for (int ep = 0; ep < 2; ep++) 6504 for (int ch = 0; ch < 3; ch++) 6505 endPoints[ep][ch] = unquantizedEndPoints[ep][ch]; 6506 6507 QuantizeTo565(endPoints[0]); 6508 QuantizeTo565(endPoints[1]); 6509 6510 IndexSelector<3> selector; 6511 selector.Init<false>(channelWeights, endPoints, range); 6512 6513 MUInt15 indexes[16]; 6514 6515 MFloat paranoidFactors[3]; 6516 for (int ch = 0; ch < 3; ch++) 6517 paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch])); 6518 6519 MFloat error = ParallelMath::MakeFloatZero(); 6520 AggregatedError<3> aggError; 6521 for (int px = 0; px < 16; px++) 6522 { 6523 MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn); 6524 indexes[px] = index; 6525 6526 if (refiner) 6527 refiner->ContributeUnweightedPW(preWeightedPixels[px], index); 6528 6529 MUInt15 reconstructed[3]; 6530 selector.ReconstructLDRPrecise(index, reconstructed); 6531 6532 if (flags & Flags::S3TC_Paranoid) 6533 { 6534 for (int ch = 0; ch < 3; ch++) 6535 error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch]; 6536 } 6537 else 6538 BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], 
aggError); 6539 } 6540 6541 if (!(flags & Flags::S3TC_Paranoid)) 6542 error = aggError.Finalize(flags, channelWeightsSq); 6543 6544 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); 6545 6546 if (ParallelMath::AnySet(better)) 6547 { 6548 ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better); 6549 6550 ParallelMath::ConditionalSet(bestError, better, error); 6551 6552 for (int ep = 0; ep < 2; ep++) 6553 for (int ch = 0; ch < 3; ch++) 6554 ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]); 6555 6556 for (int px = 0; px < 16; px++) 6557 ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]); 6558 6559 ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range))); 6560 } 6561 } 6562 TestCounts(uint32_t flags,const int * counts,int nCounts,const MUInt15 & numElements,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const MFloat preWeightedPixels[16][4],bool alphaTest,const MFloat floatSortedInputs[16][4],const MFloat preWeightedFloatSortedInputs[16][4],const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 bestIndexes[16],MUInt15 & bestRange,const ParallelMath::RoundTowardNearestForScope * rtn)6563 static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest, 6564 const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, 6565 const ParallelMath::RoundTowardNearestForScope* rtn) 6566 { 6567 UNREFERENCED_PARAMETER(alphaTest); 6568 UNREFERENCED_PARAMETER(flags); 6569 6570 EndpointRefiner<3> refiner; 6571 6572 refiner.Init(nCounts, channelWeights); 6573 6574 bool escape = false; 6575 int 
e = 0; 6576 for (int i = 0; i < nCounts; i++) 6577 { 6578 for (int n = 0; n < counts[i]; n++) 6579 { 6580 ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements); 6581 if (!ParallelMath::AnySet(valid)) 6582 { 6583 escape = true; 6584 break; 6585 } 6586 6587 if (ParallelMath::AllSet(valid)) 6588 refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i))); 6589 else 6590 { 6591 MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f)); 6592 refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight); 6593 } 6594 } 6595 6596 if (escape) 6597 break; 6598 } 6599 6600 MUInt15 endPoints[2][3]; 6601 refiner.GetRefinedEndpointsLDR(endPoints, rtn); 6602 6603 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn); 6604 } 6605 PackExplicitAlpha(uint32_t flags,const PixelBlockU8 * inputs,int inputChannel,uint8_t * packedBlocks,size_t packedBlockStride)6606 static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) 6607 { 6608 UNREFERENCED_PARAMETER(flags); 6609 ParallelMath::RoundTowardNearestForScope rtn; 6610 6611 float weights[1] = { 1.0f }; 6612 6613 MUInt15 pixels[16]; 6614 MFloat floatPixels[16]; 6615 6616 for (int px = 0; px < 16; px++) 6617 { 6618 ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); 6619 floatPixels[px] = ParallelMath::ToFloat(pixels[px]); 6620 } 6621 6622 MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } }; 6623 6624 IndexSelector<1> selector; 6625 selector.Init<false>(weights, ep, 16); 6626 6627 MUInt15 indexes[16]; 6628 6629 for (int px = 0; px < 16; px++) 6630 
indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn); 6631 6632 for (int block = 0; block < ParallelMath::ParallelSize; block++) 6633 { 6634 for (int px = 0; px < 16; px += 8) 6635 { 6636 int index0 = ParallelMath::Extract(indexes[px], block); 6637 int index1 = ParallelMath::Extract(indexes[px], block); 6638 6639 packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4)); 6640 } 6641 6642 packedBlocks += packedBlockStride; 6643 } 6644 } 6645 PackInterpolatedAlpha(uint32_t flags,const PixelBlockU8 * inputs,int inputChannel,uint8_t * packedBlocks,size_t packedBlockStride,bool isSigned,int maxTweakRounds,int numRefineRounds)6646 static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) 6647 { 6648 if (maxTweakRounds < 1) 6649 maxTweakRounds = 1; 6650 6651 if (numRefineRounds < 1) 6652 numRefineRounds = 1; 6653 6654 ParallelMath::RoundTowardNearestForScope rtn; 6655 6656 float oneWeight[1] = { 1.0f }; 6657 6658 MUInt15 pixels[16]; 6659 MFloat floatPixels[16]; 6660 6661 MUInt15 highTerminal = isSigned ? 
ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255); 6662 MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1); 6663 6664 for (int px = 0; px < 16; px++) 6665 { 6666 ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); 6667 6668 if (isSigned) 6669 pixels[px] = ParallelMath::Min(pixels[px], highTerminal); 6670 6671 floatPixels[px] = ParallelMath::ToFloat(pixels[px]); 6672 } 6673 6674 MUInt15 sortedPixels[16]; 6675 for (int px = 0; px < 16; px++) 6676 sortedPixels[px] = pixels[px]; 6677 6678 for (int sortEnd = 15; sortEnd > 0; sortEnd--) 6679 { 6680 for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++) 6681 { 6682 MUInt15 a = sortedPixels[sortOffset]; 6683 MUInt15 b = sortedPixels[sortOffset + 1]; 6684 6685 sortedPixels[sortOffset] = ParallelMath::Min(a, b); 6686 sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b); 6687 } 6688 } 6689 6690 MUInt15 zero = ParallelMath::MakeUInt15(0); 6691 MUInt15 one = ParallelMath::MakeUInt15(1); 6692 6693 MUInt15 bestIsFullRange = zero; 6694 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); 6695 MUInt15 bestEP[2] = { zero, zero }; 6696 MUInt15 bestIndexes[16] = { 6697 zero, zero, zero, zero, 6698 zero, zero, zero, zero, 6699 zero, zero, zero, zero, 6700 zero, zero, zero, zero 6701 }; 6702 6703 // Full-precision 6704 { 6705 MUInt15 minEP = sortedPixels[0]; 6706 MUInt15 maxEP = sortedPixels[15]; 6707 6708 MFloat base[1] = { ParallelMath::ToFloat(minEP) }; 6709 MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) }; 6710 6711 UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); 6712 6713 int numTweakRounds = BCCommon::TweakRoundsForRange(8); 6714 if (numTweakRounds > maxTweakRounds) 6715 numTweakRounds = maxTweakRounds; 6716 6717 for (int tweak = 0; tweak < numTweakRounds; tweak++) 6718 { 6719 MUInt15 ep[2][1]; 6720 6721 ufep.FinishLDR(tweak, 8, ep[0], ep[1]); 6722 6723 for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) 6724 { 6725 
EndpointRefiner<1> refiner; 6726 refiner.Init(8, oneWeight); 6727 6728 if (isSigned) 6729 for (int epi = 0; epi < 2; epi++) 6730 ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); 6731 6732 IndexSelector<1> indexSelector; 6733 indexSelector.Init<false>(oneWeight, ep, 8); 6734 6735 MUInt15 indexes[16]; 6736 6737 AggregatedError<1> aggError; 6738 for (int px = 0; px < 16; px++) 6739 { 6740 MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); 6741 6742 MUInt15 reconstructedPixel; 6743 6744 indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel); 6745 BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError); 6746 6747 if (refinePass != numRefineRounds - 1) 6748 refiner.ContributeUnweightedPW(&floatPixels[px], index); 6749 6750 indexes[px] = index; 6751 } 6752 MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight); 6753 6754 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); 6755 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); 6756 6757 if (ParallelMath::AnySet(errorBetter16)) 6758 { 6759 bestError = ParallelMath::Min(error, bestError); 6760 ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one); 6761 for (int px = 0; px < 16; px++) 6762 ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); 6763 6764 for (int epi = 0; epi < 2; epi++) 6765 ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); 6766 } 6767 6768 if (refinePass != numRefineRounds - 1) 6769 refiner.GetRefinedEndpointsLDR(ep, &rtn); 6770 } 6771 } 6772 } 6773 6774 // Reduced precision with special endpoints 6775 { 6776 MUInt15 bestHeuristicMin = sortedPixels[0]; 6777 MUInt15 bestHeuristicMax = sortedPixels[15]; 6778 6779 ParallelMath::Int16CompFlag canTryClipping; 6780 6781 // In reduced precision, we want try putting endpoints at the reserved indexes at the ends. 
6782 // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range. 6783 // This will usually not find anything, but it's cheap to check. 6784 6785 { 6786 MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255 6787 MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax)); 6788 6789 MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4); 6790 canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange); 6791 } 6792 6793 if (ParallelMath::AnySet(canTryClipping)) 6794 { 6795 MUInt15 lowClearances[16]; 6796 MUInt15 highClearances[16]; 6797 MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0); 6798 6799 lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0); 6800 6801 for (int px = 1; px < 16; px++) 6802 { 6803 lowClearances[px] = sortedPixels[px - 1]; 6804 highClearances[px] = highTerminal - sortedPixels[16 - px]; 6805 } 6806 6807 for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++) 6808 { 6809 uint16_t numSkippedLow = firstIndex; 6810 6811 MUInt15 lowClearance = lowClearances[firstIndex]; 6812 6813 for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++) 6814 { 6815 uint16_t numSkippedHigh = 15 - lastIndex; 6816 uint16_t numSkipped = numSkippedLow + numSkippedHigh; 6817 6818 MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped); 6819 6820 ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV); 6821 6822 if (!ParallelMath::AnySet(areMoreSkipped)) 6823 continue; 6824 6825 MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance); 6826 MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4); 6827 6828 MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex]; 6829 6830 ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & 
ParallelMath::LessOrEqual(clearanceTimes10, range)); 6831 ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]); 6832 ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]); 6833 } 6834 } 6835 } 6836 6837 MUInt15 bestSimpleMin = one; 6838 MUInt15 bestSimpleMax = highTerminalMinusOne; 6839 6840 for (int px = 0; px < 16; px++) 6841 { 6842 ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]); 6843 ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]); 6844 } 6845 6846 MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin }; 6847 MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax }; 6848 6849 int minEPRange = 2; 6850 if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1]))) 6851 minEPRange = 1; 6852 6853 int maxEPRange = 2; 6854 if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1]))) 6855 maxEPRange = 1; 6856 6857 for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++) 6858 { 6859 for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++) 6860 { 6861 MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) }; 6862 MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) }; 6863 6864 UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); 6865 6866 int numTweakRounds = BCCommon::TweakRoundsForRange(6); 6867 if (numTweakRounds > maxTweakRounds) 6868 numTweakRounds = maxTweakRounds; 6869 6870 for (int tweak = 0; tweak < numTweakRounds; tweak++) 6871 { 6872 MUInt15 ep[2][1]; 6873 6874 ufep.FinishLDR(tweak, 8, ep[0], ep[1]); 6875 6876 for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) 6877 { 6878 EndpointRefiner<1> refiner; 6879 refiner.Init(6, oneWeight); 6880 6881 if (isSigned) 6882 for (int epi = 0; epi < 2; epi++) 6883 ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); 6884 6885 
IndexSelector<1> indexSelector; 6886 indexSelector.Init<false>(oneWeight, ep, 6); 6887 6888 MUInt15 indexes[16]; 6889 MFloat error = ParallelMath::MakeFloatZero(); 6890 6891 for (int px = 0; px < 16; px++) 6892 { 6893 MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); 6894 6895 MUInt15 reconstructedPixel; 6896 6897 indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel); 6898 6899 MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight); 6900 MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight); 6901 MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight); 6902 6903 MFloat bestPixelError = zeroError; 6904 MUInt15 index = ParallelMath::MakeUInt15(6); 6905 6906 ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7)); 6907 bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError); 6908 6909 ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError); 6910 6911 if (ParallelMath::AllSet(selectedIndexBetter)) 6912 { 6913 if (refinePass != numRefineRounds - 1) 6914 refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex); 6915 } 6916 else 6917 { 6918 MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero()); 6919 6920 if (refinePass != numRefineRounds - 1) 6921 refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight); 6922 } 6923 6924 ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex); 6925 bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError); 6926 6927 error = error + bestPixelError; 6928 6929 indexes[px] = index; 6930 
} 6931 6932 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); 6933 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); 6934 6935 if (ParallelMath::AnySet(errorBetter16)) 6936 { 6937 bestError = ParallelMath::Min(error, bestError); 6938 ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero); 6939 for (int px = 0; px < 16; px++) 6940 ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); 6941 6942 for (int epi = 0; epi < 2; epi++) 6943 ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); 6944 } 6945 6946 if (refinePass != numRefineRounds - 1) 6947 refiner.GetRefinedEndpointsLDR(ep, &rtn); 6948 } 6949 } 6950 } 6951 } 6952 } 6953 6954 for (int block = 0; block < ParallelMath::ParallelSize; block++) 6955 { 6956 int ep0 = ParallelMath::Extract(bestEP[0], block); 6957 int ep1 = ParallelMath::Extract(bestEP[1], block); 6958 int isFullRange = ParallelMath::Extract(bestIsFullRange, block); 6959 6960 if (isSigned) 6961 { 6962 ep0 -= 127; 6963 ep1 -= 127; 6964 6965 assert(ep0 >= -127 && ep0 <= 127); 6966 assert(ep1 >= -127 && ep1 <= 127); 6967 } 6968 6969 6970 bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1); 6971 6972 if (swapEndpoints) 6973 std::swap(ep0, ep1); 6974 6975 uint16_t dumpBits = 0; 6976 int dumpBitsOffset = 0; 6977 int dumpByteOffset = 2; 6978 packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff); 6979 packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff); 6980 6981 int maxValue = (isFullRange != 0) ? 
7 : 5; 6982 6983 for (int px = 0; px < 16; px++) 6984 { 6985 int index = ParallelMath::Extract(bestIndexes[px], block); 6986 6987 if (swapEndpoints && index <= maxValue) 6988 index = maxValue - index; 6989 6990 if (index != 0) 6991 { 6992 if (index == maxValue) 6993 index = 1; 6994 else if (index < maxValue) 6995 index++; 6996 } 6997 6998 assert(index >= 0 && index < 8); 6999 7000 dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset); 7001 dumpBitsOffset += 3; 7002 7003 if (dumpBitsOffset >= 8) 7004 { 7005 assert(dumpByteOffset < 8); 7006 packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff); 7007 dumpBits >>= 8; 7008 dumpBitsOffset -= 8; 7009 dumpByteOffset++; 7010 } 7011 } 7012 7013 assert(dumpBitsOffset == 0); 7014 assert(dumpByteOffset == 8); 7015 7016 packedBlocks += packedBlockStride; 7017 } 7018 } 7019 PackRGB(uint32_t flags,const PixelBlockU8 * inputs,uint8_t * packedBlocks,size_t packedBlockStride,const float channelWeights[4],bool alphaTest,float alphaThreshold,bool exhaustive,int maxTweakRounds,int numRefineRounds)7020 static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) 7021 { 7022 ParallelMath::RoundTowardNearestForScope rtn; 7023 7024 if (numRefineRounds < 1) 7025 numRefineRounds = 1; 7026 7027 if (maxTweakRounds < 1) 7028 maxTweakRounds = 1; 7029 7030 EndpointSelector<3, 8> endpointSelector; 7031 7032 MUInt15 pixels[16][4]; 7033 MFloat floatPixels[16][4]; 7034 7035 MFloat preWeightedPixels[16][4]; 7036 7037 for (int px = 0; px < 16; px++) 7038 { 7039 for (int ch = 0; ch < 4; ch++) 7040 ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]); 7041 } 7042 7043 for (int px = 0; px < 16; px++) 7044 { 7045 for (int ch = 0; ch < 4; ch++) 7046 floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]); 7047 } 7048 7049 if (alphaTest) 7050 { 7051 
MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f))); 7052 7053 for (int px = 0; px < 16; px++) 7054 { 7055 ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold); 7056 pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255)); 7057 } 7058 } 7059 7060 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights); 7061 7062 MUInt15 minAlpha = ParallelMath::MakeUInt15(255); 7063 7064 for (int px = 0; px < 16; px++) 7065 minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]); 7066 7067 MFloat pixelWeights[16]; 7068 for (int px = 0; px < 16; px++) 7069 { 7070 pixelWeights[px] = ParallelMath::MakeFloat(1.0f); 7071 if (alphaTest) 7072 { 7073 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); 7074 7075 ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero()); 7076 } 7077 } 7078 7079 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) 7080 { 7081 for (int px = 0; px < 16; px++) 7082 endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]); 7083 7084 endpointSelector.FinishPass(pass); 7085 } 7086 7087 UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights); 7088 7089 MUInt15 bestEndpoints[2][3]; 7090 MUInt15 bestIndexes[16]; 7091 MUInt15 bestRange = ParallelMath::MakeUInt15(0); 7092 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); 7093 7094 for (int px = 0; px < 16; px++) 7095 bestIndexes[px] = ParallelMath::MakeUInt15(0); 7096 7097 for (int ep = 0; ep < 2; ep++) 7098 for (int ch = 0; ch < 3; ch++) 7099 bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0); 7100 7101 if (exhaustive) 7102 { 7103 MSInt16 sortBins[16]; 7104 7105 { 7106 // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins, 7107 // and pack the 
original indexes into the low bits. 7108 7109 MUInt15 sortEP[2][3]; 7110 ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]); 7111 7112 IndexSelector<3> sortSelector; 7113 sortSelector.Init<false>(channelWeights, sortEP, 1 << 11); 7114 7115 for (int16_t px = 0; px < 16; px++) 7116 { 7117 MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4); 7118 7119 if (alphaTest) 7120 { 7121 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); 7122 7123 ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0 7124 } 7125 7126 sortBin = sortBin + ParallelMath::MakeSInt16(px); 7127 7128 sortBins[px] = sortBin; 7129 } 7130 } 7131 7132 // Sort bins 7133 for (int sortEnd = 1; sortEnd < 16; sortEnd++) 7134 { 7135 for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--) 7136 { 7137 MSInt16 a = sortBins[sortLoc]; 7138 MSInt16 b = sortBins[sortLoc - 1]; 7139 7140 sortBins[sortLoc] = ParallelMath::Max(a, b); 7141 sortBins[sortLoc - 1] = ParallelMath::Min(a, b); 7142 } 7143 } 7144 7145 MUInt15 firstElement = ParallelMath::MakeUInt15(0); 7146 for (uint16_t e = 0; e < 16; e++) 7147 { 7148 ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0)); 7149 ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1)); 7150 if (!ParallelMath::AnySet(isInvalid)) 7151 break; 7152 } 7153 7154 MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement; 7155 7156 MUInt15 sortedInputs[16][4]; 7157 MFloat floatSortedInputs[16][4]; 7158 MFloat pwFloatSortedInputs[16][4]; 7159 7160 for (int e = 0; e < 16; e++) 7161 { 7162 for (int ch = 0; ch < 4; ch++) 7163 sortedInputs[e][ch] = ParallelMath::MakeUInt15(0); 7164 } 7165 7166 for (int block = 0; block < ParallelMath::ParallelSize; block++) 7167 { 7168 for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++) 7169 { 7170 
ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block); 7171 int originalIndex = (sortBin & 15); 7172 7173 for (int ch = 0; ch < 4; ch++) 7174 ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block)); 7175 } 7176 } 7177 7178 for (int e = 0; e < 16; e++) 7179 { 7180 for (int ch = 0; ch < 4; ch++) 7181 { 7182 MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]); 7183 floatSortedInputs[e][ch] = f; 7184 pwFloatSortedInputs[e][ch] = f * channelWeights[ch]; 7185 } 7186 } 7187 7188 for (int n0 = 0; n0 <= 15; n0++) 7189 { 7190 int remainingFor1 = 16 - n0; 7191 if (remainingFor1 == 16) 7192 remainingFor1 = 15; 7193 7194 for (int n1 = 0; n1 <= remainingFor1; n1++) 7195 { 7196 int remainingFor2 = 16 - n1 - n0; 7197 if (remainingFor2 == 16) 7198 remainingFor2 = 15; 7199 7200 for (int n2 = 0; n2 <= remainingFor2; n2++) 7201 { 7202 int n3 = 16 - n2 - n1 - n0; 7203 7204 if (n3 == 16) 7205 continue; 7206 7207 int counts[4] = { n0, n1, n2, n3 }; 7208 7209 TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); 7210 } 7211 } 7212 } 7213 7214 TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); 7215 7216 if (alphaTest) 7217 { 7218 for (int n0 = 0; n0 <= 15; n0++) 7219 { 7220 int remainingFor1 = 16 - n0; 7221 if (remainingFor1 == 16) 7222 remainingFor1 = 15; 7223 7224 for (int n1 = 0; n1 <= remainingFor1; n1++) 7225 { 7226 int n2 = 16 - n1 - n0; 7227 7228 if (n2 == 16) 7229 continue; 7230 7231 int counts[3] = { n0, n1, n2 }; 7232 7233 TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); 7234 } 7235 } 7236 7237 TestSingleColor(flags, pixels, 
floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); 7238 } 7239 } 7240 else 7241 { 7242 int minRange = alphaTest ? 3 : 4; 7243 7244 for (int range = minRange; range <= 4; range++) 7245 { 7246 int tweakRounds = BCCommon::TweakRoundsForRange(range); 7247 if (tweakRounds > maxTweakRounds) 7248 tweakRounds = maxTweakRounds; 7249 7250 for (int tweak = 0; tweak < tweakRounds; tweak++) 7251 { 7252 MUInt15 endPoints[2][3]; 7253 7254 ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]); 7255 7256 for (int refine = 0; refine < numRefineRounds; refine++) 7257 { 7258 EndpointRefiner<3> refiner; 7259 refiner.Init(range, channelWeights); 7260 7261 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn); 7262 7263 if (refine != numRefineRounds - 1) 7264 refiner.GetRefinedEndpointsLDR(endPoints, &rtn); 7265 } 7266 } 7267 } 7268 } 7269 7270 for (int block = 0; block < ParallelMath::ParallelSize; block++) 7271 { 7272 ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block); 7273 assert(range == 3 || range == 4); 7274 7275 ParallelMath::ScalarUInt16 compressedEP[2]; 7276 for (int ep = 0; ep < 2; ep++) 7277 { 7278 ParallelMath::ScalarUInt16 endPoint[3]; 7279 for (int ch = 0; ch < 3; ch++) 7280 endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block); 7281 7282 int compressed = (endPoint[0] & 0xf8) << 8; 7283 compressed |= (endPoint[1] & 0xfc) << 3; 7284 compressed |= (endPoint[2] & 0xf8) >> 3; 7285 7286 compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed); 7287 } 7288 7289 int indexOrder[4]; 7290 7291 if (range == 4) 7292 { 7293 if (compressedEP[0] == compressedEP[1]) 7294 { 7295 indexOrder[0] = 0; 7296 indexOrder[1] = 0; 7297 indexOrder[2] = 0; 7298 indexOrder[3] = 0; 7299 } 7300 else if (compressedEP[0] < compressedEP[1]) 7301 { 7302 std::swap(compressedEP[0], compressedEP[1]); 7303 
indexOrder[0] = 1; 7304 indexOrder[1] = 3; 7305 indexOrder[2] = 2; 7306 indexOrder[3] = 0; 7307 } 7308 else 7309 { 7310 indexOrder[0] = 0; 7311 indexOrder[1] = 2; 7312 indexOrder[2] = 3; 7313 indexOrder[3] = 1; 7314 } 7315 } 7316 else 7317 { 7318 assert(range == 3); 7319 7320 if (compressedEP[0] > compressedEP[1]) 7321 { 7322 std::swap(compressedEP[0], compressedEP[1]); 7323 indexOrder[0] = 1; 7324 indexOrder[1] = 2; 7325 indexOrder[2] = 0; 7326 } 7327 else 7328 { 7329 indexOrder[0] = 0; 7330 indexOrder[1] = 2; 7331 indexOrder[2] = 1; 7332 } 7333 indexOrder[3] = 3; 7334 } 7335 7336 packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff); 7337 packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff); 7338 packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff); 7339 packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff); 7340 7341 for (int i = 0; i < 16; i += 4) 7342 { 7343 int packedIndexes = 0; 7344 for (int subi = 0; subi < 4; subi++) 7345 { 7346 ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block); 7347 packedIndexes |= (indexOrder[index] << (subi * 2)); 7348 } 7349 7350 packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes); 7351 } 7352 7353 packedBlocks += packedBlockStride; 7354 } 7355 } 7356 }; 7357 7358 // Signed input blocks are converted into unsigned space, with the maximum value being 254 BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize],const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])7359 void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize]) 7360 { 7361 for (size_t block = 0; block < ParallelMath::ParallelSize; block++) 7362 { 7363 const PixelBlockS8& inputSignedBlock = inputSigned[block]; 7364 PixelBlockU8& inputNormalizedBlock = inputNormalized[block]; 7365 7366 for (size_t px = 0; px < 16; px++) 7367 { 7368 for (size_t ch = 0; ch < 4; ch++) 7369 
inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127); 7370 } 7371 } 7372 } 7373 FillWeights(const Options & options,float channelWeights[4])7374 void FillWeights(const Options &options, float channelWeights[4]) 7375 { 7376 if (options.flags & Flags::Uniform) 7377 channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f; 7378 else 7379 { 7380 channelWeights[0] = options.redWeight; 7381 channelWeights[1] = options.greenWeight; 7382 channelWeights[2] = options.blueWeight; 7383 channelWeights[3] = options.alphaWeight; 7384 } 7385 } 7386 } 7387 7388 namespace Kernels 7389 { EncodeBC7(uint8_t * pBC,const PixelBlockU8 * pBlocks,const cvtt::Options & options)7390 void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options) 7391 { 7392 assert(pBlocks); 7393 assert(pBC); 7394 7395 float channelWeights[4]; 7396 Internal::FillWeights(options, channelWeights); 7397 7398 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7399 { 7400 Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7); 7401 pBC += ParallelMath::ParallelSize * 16; 7402 } 7403 } 7404 EncodeBC6HU(uint8_t * pBC,const PixelBlockF16 * pBlocks,const cvtt::Options & options)7405 void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options) 7406 { 7407 assert(pBlocks); 7408 assert(pBC); 7409 7410 float channelWeights[4]; 7411 Internal::FillWeights(options, channelWeights); 7412 7413 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7414 { 7415 Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H); 7416 pBC += ParallelMath::ParallelSize * 16; 7417 } 7418 } 7419 EncodeBC6HS(uint8_t * pBC,const 
PixelBlockF16 * pBlocks,const cvtt::Options & options)7420 void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options) 7421 { 7422 assert(pBlocks); 7423 assert(pBC); 7424 7425 float channelWeights[4]; 7426 Internal::FillWeights(options, channelWeights); 7427 7428 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7429 { 7430 Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H); 7431 pBC += ParallelMath::ParallelSize * 16; 7432 } 7433 } 7434 EncodeBC1(uint8_t * pBC,const PixelBlockU8 * pBlocks,const cvtt::Options & options)7435 void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options) 7436 { 7437 assert(pBlocks); 7438 assert(pBC); 7439 7440 float channelWeights[4]; 7441 Internal::FillWeights(options, channelWeights); 7442 7443 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7444 { 7445 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); 7446 pBC += ParallelMath::ParallelSize * 8; 7447 } 7448 } 7449 EncodeBC2(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7450 void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) 7451 { 7452 assert(pBlocks); 7453 assert(pBC); 7454 7455 float channelWeights[4]; 7456 Internal::FillWeights(options, channelWeights); 7457 7458 for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7459 { 7460 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); 7461 
Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16); 7462 pBC += ParallelMath::ParallelSize * 16; 7463 } 7464 } 7465 EncodeBC3(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7466 void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) 7467 { 7468 assert(pBlocks); 7469 assert(pBC); 7470 7471 float channelWeights[4]; 7472 Internal::FillWeights(options, channelWeights); 7473 7474 for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7475 { 7476 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC); 7477 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC); 7478 pBC += ParallelMath::ParallelSize * 16; 7479 } 7480 } 7481 EncodeBC4U(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7482 void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) 7483 { 7484 assert(pBlocks); 7485 assert(pBC); 7486 7487 float channelWeights[4]; 7488 Internal::FillWeights(options, channelWeights); 7489 7490 for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7491 { 7492 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC); 7493 pBC += ParallelMath::ParallelSize * 8; 7494 } 7495 } 7496 EncodeBC4S(uint8_t * pBC,const PixelBlockS8 * pBlocks,const Options & options)7497 void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options) 7498 { 7499 assert(pBlocks); 7500 assert(pBC); 7501 7502 float channelWeights[4]; 7503 Internal::FillWeights(options, channelWeights); 7504 7505 for (size_t blockBase = 0; blockBase < NumParallelBlocks; 
blockBase += ParallelMath::ParallelSize) 7506 { 7507 PixelBlockU8 inputBlocks[ParallelMath::ParallelSize]; 7508 Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase); 7509 7510 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC); 7511 pBC += ParallelMath::ParallelSize * 8; 7512 } 7513 } 7514 EncodeBC5U(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7515 void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options) 7516 { 7517 assert(pBlocks); 7518 assert(pBC); 7519 7520 float channelWeights[4]; 7521 Internal::FillWeights(options, channelWeights); 7522 7523 for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7524 { 7525 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC); 7526 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC); 7527 pBC += ParallelMath::ParallelSize * 16; 7528 } 7529 } 7530 EncodeBC5S(uint8_t * pBC,const PixelBlockS8 * pBlocks,const Options & options)7531 void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options) 7532 { 7533 assert(pBlocks); 7534 assert(pBC); 7535 7536 float channelWeights[4]; 7537 Internal::FillWeights(options, channelWeights); 7538 7539 for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize) 7540 { 7541 PixelBlockU8 inputBlocks[ParallelMath::ParallelSize]; 7542 Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase); 7543 7544 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC); 7545 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, 
options.refineRoundsIIC); 7546 pBC += ParallelMath::ParallelSize * 16; 7547 } 7548 } 7549 DecodeBC7(PixelBlockU8 * pBlocks,const uint8_t * pBC)7550 void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC) 7551 { 7552 assert(pBlocks); 7553 assert(pBC); 7554 7555 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) 7556 { 7557 Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC); 7558 pBC += 16; 7559 } 7560 } 7561 DecodeBC6HU(PixelBlockF16 * pBlocks,const uint8_t * pBC)7562 void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC) 7563 { 7564 assert(pBlocks); 7565 assert(pBC); 7566 7567 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) 7568 { 7569 Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false); 7570 pBC += 16; 7571 } 7572 } 7573 DecodeBC6HS(PixelBlockF16 * pBlocks,const uint8_t * pBC)7574 void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC) 7575 { 7576 assert(pBlocks); 7577 assert(pBC); 7578 7579 for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++) 7580 { 7581 Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true); 7582 pBC += 16; 7583 } 7584 } 7585 } 7586 } 7587