1 /*
2 Convection Texture Tools
3 Copyright (c) 2018 Eric Lasota
4 
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject
11 to the following conditions:
12 
13 The above copyright notice and this permission notice shall be included
14 in all copies or substantial portions of the Software.
15 
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 
24 -------------------------------------------------------------------------------------
25 
26 Portions based on DirectX Texture Library (DirectXTex)
27 
28 Copyright (c) Microsoft Corporation. All rights reserved.
29 Licensed under the MIT License.
30 
31 http://go.microsoft.com/fwlink/?LinkId=248926
32 */
33 #include "ConvectionKernels.h"
34 #include "ConvectionKernels_BC7_SingleColor.h"
35 
36 #if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
37 #define CVTT_USE_SSE2
38 #endif
39 
40 #ifdef CVTT_USE_SSE2
41 #include <emmintrin.h>
42 #endif
43 
44 #include <float.h>
45 #include <assert.h>
46 #include <string.h>
47 #include <algorithm>
48 #include <math.h>
49 
50 #define UNREFERENCED_PARAMETER(n) ((void)n)
51 
52 namespace cvtt
53 {
54 #ifdef CVTT_USE_SSE2
55     // SSE2 version
56     struct ParallelMath
57     {
        // Scalar 16-bit integer aliases (unsigned and signed) exposed by this ParallelMath implementation.
        typedef uint16_t ScalarUInt16;
        typedef int16_t ScalarSInt16;
60 
61         template<unsigned int TRoundingMode>
62         struct RoundForScope
63         {
64             unsigned int m_oldCSR;
65 
RoundForScopecvtt::ParallelMath::RoundForScope66             RoundForScope()
67             {
68                 m_oldCSR = _mm_getcsr();
69                 _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
70             }
71 
~RoundForScopecvtt::ParallelMath::RoundForScope72             ~RoundForScope()
73             {
74                 _mm_setcsr(m_oldCSR);
75             }
76         };
77 
78         struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
79         {
80         };
81 
82         struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
83         {
84         };
85 
86         struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
87         {
88         };
89 
90         struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
91         {
92         };
93 
        // Lane count of this implementation; 8 matches the eight 16-bit lanes of the __m128i held by VInt16
        // and the two 4-float registers held by Float.
        static const int ParallelSize = 8;
95 
96         enum Int16Subtype
97         {
98             IntSubtype_Signed,
99             IntSubtype_UnsignedFull,
100             IntSubtype_UnsignedTruncated,
101             IntSubtype_Abstract,
102         };
103 
104         template<int TSubtype>
105         struct VInt16
106         {
107             __m128i m_value;
108 
operator +cvtt::ParallelMath::VInt16109             inline VInt16 operator+(int16_t other) const
110             {
111                 VInt16 result;
112                 result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
113                 return result;
114             }
115 
operator +cvtt::ParallelMath::VInt16116             inline VInt16 operator+(const VInt16 &other) const
117             {
118                 VInt16 result;
119                 result.m_value = _mm_add_epi16(m_value, other.m_value);
120                 return result;
121             }
122 
operator |cvtt::ParallelMath::VInt16123             inline VInt16 operator|(const VInt16 &other) const
124             {
125                 VInt16 result;
126                 result.m_value = _mm_or_si128(m_value, other.m_value);
127                 return result;
128             }
129 
operator &cvtt::ParallelMath::VInt16130             inline VInt16 operator&(const VInt16 &other) const
131             {
132                 VInt16 result;
133                 result.m_value = _mm_and_si128(m_value, other.m_value);
134                 return result;
135             }
136 
operator -cvtt::ParallelMath::VInt16137             inline VInt16 operator-(const VInt16 &other) const
138             {
139                 VInt16 result;
140                 result.m_value = _mm_sub_epi16(m_value, other.m_value);
141                 return result;
142             }
143 
operator <<cvtt::ParallelMath::VInt16144             inline VInt16 operator<<(int bits) const
145             {
146                 VInt16 result;
147                 result.m_value = _mm_slli_epi16(m_value, bits);
148                 return result;
149             }
150         };
151 
        // Named VInt16 instantiations, one per Int16Subtype tag.
        typedef VInt16<IntSubtype_Signed> SInt16;
        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
        typedef VInt16<IntSubtype_Abstract> AInt16;
156 
157         template<int TSubtype>
158         struct VInt32
159         {
160             __m128i m_values[2];
161 
operator +cvtt::ParallelMath::VInt32162             inline VInt32 operator+(const VInt32& other) const
163             {
164                 VInt32 result;
165                 result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
166                 result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
167                 return result;
168             }
169 
operator -cvtt::ParallelMath::VInt32170             inline VInt32 operator-(const VInt32& other) const
171             {
172                 VInt32 result;
173                 result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
174                 result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
175                 return result;
176             }
177 
operator <<cvtt::ParallelMath::VInt32178             inline VInt32 operator<<(const int other) const
179             {
180                 VInt32 result;
181                 result.m_values[0] = _mm_slli_epi32(m_values[0], other);
182                 result.m_values[1] = _mm_slli_epi32(m_values[1], other);
183                 return result;
184             }
185         };
186 
        // Named VInt32 instantiations; they reuse the Int16Subtype tags as the subtype argument.
        typedef VInt32<IntSubtype_Signed> SInt32;
        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
        typedef VInt32<IntSubtype_Abstract> AInt32;
191 
192         template<class TTargetType>
193         struct LosslessCast
194         {
195 #ifdef CVTT_PERMIT_ALIASING
196             template<int TSrcSubtype>
Castcvtt::ParallelMath::LosslessCast197             static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
198             {
199                 return reinterpret_cast<VInt32<TSubtype>&>(src);
200             }
201 
202             template<int TSrcSubtype>
Castcvtt::ParallelMath::LosslessCast203             static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
204             {
205                 return reinterpret_cast<VInt16<TSubtype>&>(src);
206             }
207 #else
208             template<int TSrcSubtype>
209             static TTargetType Cast(const VInt32<TSrcSubtype> &src)
210             {
211                 TTargetType result;
212                 result.m_values[0] = src.m_values[0];
213                 result.m_values[1] = src.m_values[1];
214                 return result;
215             }
216 
217             template<int TSrcSubtype>
218             static TTargetType Cast(const VInt16<TSrcSubtype> &src)
219             {
220                 TTargetType result;
221                 result.m_value = src.m_value;
222                 return result;
223             }
224 #endif
225         };
226 
        // Storage of four __m128i registers; presumably eight 64-bit lanes (two per register) — confirm against users.
        struct Int64
        {
            __m128i m_values[4];
        };
231 
232         struct Float
233         {
234             __m128 m_values[2];
235 
operator +cvtt::ParallelMath::Float236             inline Float operator+(const Float &other) const
237             {
238                 Float result;
239                 result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
240                 result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
241                 return result;
242             }
243 
operator +cvtt::ParallelMath::Float244             inline Float operator+(float other) const
245             {
246                 Float result;
247                 result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
248                 result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
249                 return result;
250             }
251 
operator -cvtt::ParallelMath::Float252             inline Float operator-(const Float& other) const
253             {
254                 Float result;
255                 result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
256                 result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
257                 return result;
258             }
259 
operator -cvtt::ParallelMath::Float260             inline Float operator-() const
261             {
262                 Float result;
263                 result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
264                 result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
265                 return result;
266             }
267 
operator *cvtt::ParallelMath::Float268             inline Float operator*(const Float& other) const
269             {
270                 Float result;
271                 result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
272                 result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
273                 return result;
274             }
275 
operator *cvtt::ParallelMath::Float276             inline Float operator*(float other) const
277             {
278                 Float result;
279                 result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
280                 result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
281                 return result;
282             }
283 
operator /cvtt::ParallelMath::Float284             inline Float operator/(const Float &other) const
285             {
286                 Float result;
287                 result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
288                 result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
289                 return result;
290             }
291 
operator /cvtt::ParallelMath::Float292             inline Float operator/(float other) const
293             {
294                 Float result;
295                 result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
296                 result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
297                 return result;
298             }
299         };
300 
301         struct Int16CompFlag
302         {
303             __m128i m_value;
304 
operator &cvtt::ParallelMath::Int16CompFlag305             inline Int16CompFlag operator&(const Int16CompFlag &other) const
306             {
307                 Int16CompFlag result;
308                 result.m_value = _mm_and_si128(m_value, other.m_value);
309                 return result;
310             }
311 
operator |cvtt::ParallelMath::Int16CompFlag312             inline Int16CompFlag operator|(const Int16CompFlag &other) const
313             {
314                 Int16CompFlag result;
315                 result.m_value = _mm_or_si128(m_value, other.m_value);
316                 return result;
317             }
318         };
319 
        // Per-lane mask for Float values: two __m128 registers, the same layout as Float::m_values.
        struct FloatCompFlag
        {
            __m128 m_values[2];
        };
324 
325         template<int TSubtype>
AbstractAddcvtt::ParallelMath326         static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
327         {
328             VInt16<TSubtype> result;
329             result.m_value = _mm_add_epi16(a.m_value, b.m_value);
330             return result;
331         }
332 
333         template<int TSubtype>
AbstractSubtractcvtt::ParallelMath334         static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
335         {
336             VInt16<TSubtype> result;
337             result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
338             return result;
339         }
340 
Selectcvtt::ParallelMath341         static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
342         {
343             Float result;
344             for (int i = 0; i < 2; i++)
345                 result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
346             return result;
347         }
348 
349         template<int TSubtype>
Selectcvtt::ParallelMath350         static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
351         {
352             VInt16<TSubtype> result;
353             result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
354             return result;
355         }
356 
357         template<int TSubtype>
SelectOrZerocvtt::ParallelMath358         static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
359         {
360             VInt16<TSubtype> result;
361             result.m_value = _mm_and_si128(flag.m_value, a.m_value);
362             return result;
363         }
364 
365         template<int TSubtype>
ConditionalSetcvtt::ParallelMath366         static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
367         {
368             dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
369         }
370 
ConditionalNegatecvtt::ParallelMath371         static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
372         {
373             SInt16 result;
374             result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
375             return result;
376         }
377 
378         template<int TSubtype>
NotConditionalSetcvtt::ParallelMath379         static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
380         {
381             dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
382         }
383 
ConditionalSetcvtt::ParallelMath384         static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
385         {
386             for (int i = 0; i < 2; i++)
387                 dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
388         }
389 
NotConditionalSetcvtt::ParallelMath390         static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
391         {
392             for (int i = 0; i < 2; i++)
393                 dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
394         }
395 
MakeSafeDenominatorcvtt::ParallelMath396         static void MakeSafeDenominator(Float& v)
397         {
398             ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
399         }
400 
TruncateToPrecisionSignedcvtt::ParallelMath401         static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
402         {
403             int lostBits = 16 - precision;
404             if (lostBits == 0)
405                 return v;
406 
407             SInt16 result;
408             result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
409             return result;
410         }
411 
TruncateToPrecisionUnsignedcvtt::ParallelMath412         static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
413         {
414             int lostBits = 16 - precision;
415             if (lostBits == 0)
416                 return v;
417 
418             UInt16 result;
419             result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
420             return result;
421         }
422 
Mincvtt::ParallelMath423         static UInt16 Min(const UInt16 &a, const UInt16 &b)
424         {
425             __m128i bitFlip = _mm_set1_epi16(-32768);
426 
427             UInt16 result;
428             result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
429             return result;
430         }
431 
Mincvtt::ParallelMath432         static SInt16 Min(const SInt16 &a, const SInt16 &b)
433         {
434             SInt16 result;
435             result.m_value = _mm_min_epi16(a.m_value, b.m_value);
436             return result;
437         }
438 
Mincvtt::ParallelMath439         static UInt15 Min(const UInt15 &a, const UInt15 &b)
440         {
441             UInt15 result;
442             result.m_value = _mm_min_epi16(a.m_value, b.m_value);
443             return result;
444         }
445 
Mincvtt::ParallelMath446         static Float Min(const Float &a, const Float &b)
447         {
448             Float result;
449             for (int i = 0; i < 2; i++)
450                 result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
451             return result;
452         }
453 
Maxcvtt::ParallelMath454         static UInt16 Max(const UInt16 &a, const UInt16 &b)
455         {
456             __m128i bitFlip = _mm_set1_epi16(-32768);
457 
458             UInt16 result;
459             result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
460             return result;
461         }
462 
Maxcvtt::ParallelMath463         static SInt16 Max(const SInt16 &a, const SInt16 &b)
464         {
465             SInt16 result;
466             result.m_value = _mm_max_epi16(a.m_value, b.m_value);
467             return result;
468         }
469 
Maxcvtt::ParallelMath470         static UInt15 Max(const UInt15 &a, const UInt15 &b)
471         {
472             UInt15 result;
473             result.m_value = _mm_max_epi16(a.m_value, b.m_value);
474             return result;
475         }
476 
Maxcvtt::ParallelMath477         static Float Max(const Float &a, const Float &b)
478         {
479             Float result;
480             for (int i = 0; i < 2; i++)
481                 result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
482             return result;
483         }
484 
Clampcvtt::ParallelMath485         static Float Clamp(const Float &v, float min, float max)
486         {
487             Float result;
488             for (int i = 0; i < 2; i++)
489                 result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
490             return result;
491         }
492 
Reciprocalcvtt::ParallelMath493         static Float Reciprocal(const Float &v)
494         {
495             Float result;
496             for (int i = 0; i < 2; i++)
497                 result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
498             return result;
499         }
500 
ConvertLDRInputscvtt::ParallelMath501         static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
502         {
503             int16_t values[8];
504             for (int i = 0; i < 8; i++)
505                 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
506 
507             chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
508         }
509 
ConvertHDRInputscvtt::ParallelMath510         static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
511         {
512             int16_t values[8];
513             for (int i = 0; i < 8; i++)
514                 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
515 
516             chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
517         }
518 
MakeFloatcvtt::ParallelMath519         static Float MakeFloat(float v)
520         {
521             Float f;
522             f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
523             return f;
524         }
525 
MakeFloatZerocvtt::ParallelMath526         static Float MakeFloatZero()
527         {
528             Float f;
529             f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
530             return f;
531         }
532 
MakeUInt16cvtt::ParallelMath533         static UInt16 MakeUInt16(uint16_t v)
534         {
535             UInt16 result;
536             result.m_value = _mm_set1_epi16(static_cast<short>(v));
537             return result;
538         }
539 
MakeSInt16cvtt::ParallelMath540         static SInt16 MakeSInt16(int16_t v)
541         {
542             SInt16 result;
543             result.m_value = _mm_set1_epi16(static_cast<short>(v));
544             return result;
545         }
546 
MakeAInt16cvtt::ParallelMath547         static AInt16 MakeAInt16(int16_t v)
548         {
549             AInt16 result;
550             result.m_value = _mm_set1_epi16(static_cast<short>(v));
551             return result;
552         }
553 
MakeUInt15cvtt::ParallelMath554         static UInt15 MakeUInt15(uint16_t v)
555         {
556             UInt15 result;
557             result.m_value = _mm_set1_epi16(static_cast<short>(v));
558             return result;
559         }
560 
MakeSInt32cvtt::ParallelMath561         static SInt32 MakeSInt32(int32_t v)
562         {
563             SInt32 result;
564             result.m_values[0] = _mm_set1_epi32(v);
565             result.m_values[1] = _mm_set1_epi32(v);
566             return result;
567         }
568 
MakeUInt31cvtt::ParallelMath569         static UInt31 MakeUInt31(uint32_t v)
570         {
571             UInt31 result;
572             result.m_values[0] = _mm_set1_epi32(v);
573             result.m_values[1] = _mm_set1_epi32(v);
574             return result;
575         }
576 
Extractcvtt::ParallelMath577         static uint16_t Extract(const UInt16 &v, int offset)
578         {
579             return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
580         }
581 
Extractcvtt::ParallelMath582         static int16_t Extract(const SInt16 &v, int offset)
583         {
584             return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
585         }
586 
Extractcvtt::ParallelMath587         static uint16_t Extract(const UInt15 &v, int offset)
588         {
589             return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
590         }
591 
Extractcvtt::ParallelMath592         static int16_t Extract(const AInt16 &v, int offset)
593         {
594             return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
595         }
596 
PutUInt16cvtt::ParallelMath597         static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
598         {
599             reinterpret_cast<uint16_t*>(&dest)[offset] = v;
600         }
601 
PutUInt15cvtt::ParallelMath602         static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
603         {
604             reinterpret_cast<uint16_t*>(&dest)[offset] = v;
605         }
606 
PutSInt16cvtt::ParallelMath607         static void PutSInt16(SInt16 &dest, int offset, int16_t v)
608         {
609             reinterpret_cast<int16_t*>(&dest)[offset] = v;
610         }
611 
ExtractFloatcvtt::ParallelMath612         static float ExtractFloat(const Float& v, int offset)
613         {
614             return reinterpret_cast<const float*>(&v)[offset];
615         }
616 
PutFloatcvtt::ParallelMath617         static void PutFloat(Float &dest, int offset, float v)
618         {
619             reinterpret_cast<float*>(&dest)[offset] = v;
620         }
621 
Lesscvtt::ParallelMath622         static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
623         {
624             Int16CompFlag result;
625             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
626             return result;
627         }
628 
Lesscvtt::ParallelMath629         static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
630         {
631             Int16CompFlag result;
632             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
633             return result;
634         }
635 
LessOrEqualcvtt::ParallelMath636         static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
637         {
638             Int16CompFlag result;
639             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
640             return result;
641         }
642 
Lesscvtt::ParallelMath643         static FloatCompFlag Less(const Float &a, const Float &b)
644         {
645             FloatCompFlag result;
646             for (int i = 0; i < 2; i++)
647                 result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
648             return result;
649         }
650 
LessOrEqualcvtt::ParallelMath651         static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
652         {
653             FloatCompFlag result;
654             for (int i = 0; i < 2; i++)
655                 result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
656             return result;
657         }
658 
659         template<int TSubtype>
Equalcvtt::ParallelMath660         static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
661         {
662             Int16CompFlag result;
663             result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
664             return result;
665         }
666 
Equalcvtt::ParallelMath667         static FloatCompFlag Equal(const Float &a, const Float &b)
668         {
669             FloatCompFlag result;
670             for (int i = 0; i < 2; i++)
671                 result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
672             return result;
673         }
674 
ToFloatcvtt::ParallelMath675         static Float ToFloat(const UInt16 &v)
676         {
677             Float result;
678             result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
679             result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
680             return result;
681         }
682 
ToUInt31cvtt::ParallelMath683         static UInt31 ToUInt31(const UInt16 &v)
684         {
685             UInt31 result;
686             result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
687             result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
688             return result;
689         }
690 
ToInt32cvtt::ParallelMath691         static SInt32 ToInt32(const UInt16 &v)
692         {
693             SInt32 result;
694             result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
695             result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
696             return result;
697         }
698 
ToInt32cvtt::ParallelMath699         static SInt32 ToInt32(const SInt16 &v)
700         {
701             SInt32 result;
702             result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
703             result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
704             return result;
705         }
706 
ToFloatcvtt::ParallelMath707         static Float ToFloat(const SInt16 &v)
708         {
709             Float result;
710             result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
711             result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
712             return result;
713         }
714 
ToFloatcvtt::ParallelMath715         static Float ToFloat(const UInt15 &v)
716         {
717             Float result;
718             result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
719             result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
720             return result;
721         }
722 
ToFloatcvtt::ParallelMath723         static Float ToFloat(const UInt31 &v)
724         {
725             Float result;
726             result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
727             result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
728             return result;
729         }
730 
FloatFlagToInt16cvtt::ParallelMath731         static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
732         {
733             __m128i lo = _mm_castps_si128(v.m_values[0]);
734             __m128i hi = _mm_castps_si128(v.m_values[1]);
735 
736             Int16CompFlag result;
737             result.m_value = _mm_packs_epi32(lo, hi);
738             return result;
739         }
740 
Int16FlagToFloatcvtt::ParallelMath741         static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
742         {
743             __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
744             __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
745 
746             FloatCompFlag result;
747             result.m_values[0] = _mm_castsi128_ps(lo);
748             result.m_values[1] = _mm_castsi128_ps(hi);
749             return result;
750         }
751 
MakeBoolInt16cvtt::ParallelMath752         static Int16CompFlag MakeBoolInt16(bool b)
753         {
754             Int16CompFlag result;
755             if (b)
756                 result.m_value = _mm_set1_epi16(-1);
757             else
758                 result.m_value = _mm_setzero_si128();
759             return result;
760         }
761 
MakeBoolFloatcvtt::ParallelMath762         static FloatCompFlag MakeBoolFloat(bool b)
763         {
764             FloatCompFlag result;
765             if (b)
766                 result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
767             else
768                 result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
769             return result;
770         }
771 
AndNotcvtt::ParallelMath772         static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
773         {
774             Int16CompFlag result;
775             result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
776             return result;
777         }
778 
RoundAndConvertToU16cvtt::ParallelMath779         static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
780         {
781             __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
782             __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
783 
784             __m128i packed = _mm_packs_epi32(lo, hi);
785 
786             UInt16 result;
787             result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
788             return result;
789         }
790 
RoundAndConvertToU15cvtt::ParallelMath791         static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
792         {
793             __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
794             __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
795 
796             __m128i packed = _mm_packs_epi32(lo, hi);
797 
798             UInt15 result;
799             result.m_value = _mm_packs_epi32(lo, hi);
800             return result;
801         }
802 
RoundAndConvertToS16cvtt::ParallelMath803         static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
804         {
805             __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
806             __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
807 
808             __m128i packed = _mm_packs_epi32(lo, hi);
809 
810             SInt16 result;
811             result.m_value = _mm_packs_epi32(lo, hi);
812             return result;
813         }
814 
Sqrtcvtt::ParallelMath815         static Float Sqrt(const Float &f)
816         {
817             Float result;
818             for (int i = 0; i < 2; i++)
819                 result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
820             return result;
821         }
822 
Abscvtt::ParallelMath823         static UInt16 Abs(const SInt16 &a)
824         {
825             __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
826             __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
827 
828             UInt16 result;
829             result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
830             return result;
831         }
832 
Abscvtt::ParallelMath833         static Float Abs(const Float& a)
834         {
835             __m128 invMask = _mm_set1_ps(-0.0f);
836 
837             Float result;
838             result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
839             result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
840             return result;
841         }
842 
SqDiffUInt8cvtt::ParallelMath843         static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
844         {
845             __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
846 
847             UInt16 result;
848             result.m_value = _mm_mullo_epi16(diff, diff);
849             return result;
850         }
851 
SqDiffSInt16cvtt::ParallelMath852         static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
853         {
854             __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
855 
856             __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
857             __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
858             __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
859             __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
860 
861             Float result;
862             result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
863             result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
864 
865             return result;
866         }
867 
TwosCLHalfToFloatcvtt::ParallelMath868         static Float TwosCLHalfToFloat(const SInt16 &v)
869         {
870             __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
871 
872             __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
873             __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
874             __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
875 
876             __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
877 
878             // Convert exponent to high-bits
879             exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
880 
881             __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
882 
883             __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
884             __m128i lowBits = _mm_slli_epi16(mantissa, 13);
885 
886             __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
887             __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
888 
889             __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
890             __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
891 
892             Float result;
893             result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
894             result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
895 
896             return result;
897         }
898 
SqDiff2CLFloatcvtt::ParallelMath899         static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
900         {
901             Float fa = TwosCLHalfToFloat(a);
902 
903             Float diff = fa - b;
904             return diff * diff;
905         }
906 
SqDiff2CLcvtt::ParallelMath907         static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
908         {
909             Float fa = TwosCLHalfToFloat(a);
910             Float fb = TwosCLHalfToFloat(b);
911 
912             Float diff = fa - fb;
913             return diff * diff;
914         }
915 
SqDiff2CLFloatcvtt::ParallelMath916         static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
917         {
918             Float fa = TwosCLHalfToFloat(a) * aWeight;
919 
920             Float diff = fa - b;
921             return diff * diff;
922         }
923 
RightShiftcvtt::ParallelMath924         static UInt16 RightShift(const UInt16 &v, int bits)
925         {
926             UInt16 result;
927             result.m_value = _mm_srli_epi16(v.m_value, bits);
928             return result;
929         }
930 
RightShiftcvtt::ParallelMath931         static UInt31 RightShift(const UInt31 &v, int bits)
932         {
933             UInt31 result;
934             result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
935             result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
936             return result;
937         }
938 
RightShiftcvtt::ParallelMath939         static SInt16 RightShift(const SInt16 &v, int bits)
940         {
941             SInt16 result;
942             result.m_value = _mm_srai_epi16(v.m_value, bits);
943             return result;
944         }
945 
RightShiftcvtt::ParallelMath946         static UInt15 RightShift(const UInt15 &v, int bits)
947         {
948             UInt15 result;
949             result.m_value = _mm_srli_epi16(v.m_value, bits);
950             return result;
951         }
952 
RightShiftcvtt::ParallelMath953         static SInt32 RightShift(const SInt32 &v, int bits)
954         {
955             SInt32 result;
956             result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
957             result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
958             return result;
959         }
960 
ToSInt16cvtt::ParallelMath961         static SInt16 ToSInt16(const SInt32 &v)
962         {
963             SInt16 result;
964             result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
965             return result;
966         }
967 
ToUInt16cvtt::ParallelMath968         static UInt16 ToUInt16(const UInt32 &v)
969         {
970             __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
971             __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
972 
973             UInt16 result;
974             result.m_value = _mm_packs_epi32(low, high);
975             return result;
976         }
977 
ToUInt16cvtt::ParallelMath978         static UInt16 ToUInt16(const UInt31 &v)
979         {
980             __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
981             __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
982 
983             UInt16 result;
984             result.m_value = _mm_packs_epi32(low, high);
985             return result;
986         }
987 
ToUInt15cvtt::ParallelMath988         static UInt15 ToUInt15(const UInt31 &v)
989         {
990             UInt15 result;
991             result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
992             return result;
993         }
994 
XMultiplycvtt::ParallelMath995         static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
996         {
997             __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
998             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
999 
1000             SInt32 result;
1001             result.m_values[0] = _mm_unpacklo_epi16(low, high);
1002             result.m_values[1] = _mm_unpackhi_epi16(low, high);
1003             return result;
1004         }
1005 
XMultiplycvtt::ParallelMath1006         static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
1007         {
1008             __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
1009             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1010 
1011             SInt32 result;
1012             result.m_values[0] = _mm_unpacklo_epi16(low, high);
1013             result.m_values[1] = _mm_unpackhi_epi16(low, high);
1014             return result;
1015         }
1016 
XMultiplycvtt::ParallelMath1017         static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
1018         {
1019             return XMultiply(b, a);
1020         }
1021 
XMultiplycvtt::ParallelMath1022         static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
1023         {
1024             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1025             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1026 
1027             UInt32 result;
1028             result.m_values[0] = _mm_unpacklo_epi16(low, high);
1029             result.m_values[1] = _mm_unpackhi_epi16(low, high);
1030             return result;
1031         }
1032 
CompactMultiplycvtt::ParallelMath1033         static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
1034         {
1035             UInt16 result;
1036             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1037             return result;
1038         }
1039 
CompactMultiplycvtt::ParallelMath1040         static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
1041         {
1042             UInt16 result;
1043             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1044             return result;
1045         }
1046 
XMultiplycvtt::ParallelMath1047         static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
1048         {
1049             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1050             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1051 
1052             UInt31 result;
1053             result.m_values[0] = _mm_unpacklo_epi16(low, high);
1054             result.m_values[1] = _mm_unpackhi_epi16(low, high);
1055             return result;
1056         }
1057 
XMultiplycvtt::ParallelMath1058         static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
1059         {
1060             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1061             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1062 
1063             UInt31 result;
1064             result.m_values[0] = _mm_unpacklo_epi16(low, high);
1065             result.m_values[1] = _mm_unpackhi_epi16(low, high);
1066             return result;
1067         }
1068 
XMultiplycvtt::ParallelMath1069         static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
1070         {
1071             return XMultiply(b, a);
1072         }
1073 
AnySetcvtt::ParallelMath1074         static bool AnySet(const Int16CompFlag &v)
1075         {
1076             return _mm_movemask_epi8(v.m_value) != 0;
1077         }
1078 
AllSetcvtt::ParallelMath1079         static bool AllSet(const Int16CompFlag &v)
1080         {
1081             return _mm_movemask_epi8(v.m_value) == 0xffff;
1082         }
1083 
AnySetcvtt::ParallelMath1084         static bool AnySet(const FloatCompFlag &v)
1085         {
1086             return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
1087         }
1088 
AllSetcvtt::ParallelMath1089         static bool AllSet(const FloatCompFlag &v)
1090         {
1091             return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
1092         }
1093     };
1094 
1095 #else
1096     // Scalar version
1097     struct ParallelMath
1098     {
1099         struct RoundTowardZeroForScope
1100         {
1101         };
1102 
1103         struct RoundTowardNearestForScope
1104         {
1105         };
1106 
1107         struct RoundUpForScope
1108         {
1109         };
1110 
1111         struct RoundDownForScope
1112         {
1113         };
1114 
1115         static const int ParallelSize = 1;
1116 
1117         enum Int16Subtype
1118         {
1119             IntSubtype_Signed,
1120             IntSubtype_UnsignedFull,
1121             IntSubtype_UnsignedTruncated,
1122             IntSubtype_Abstract,
1123         };
1124 
1125         typedef int32_t SInt16;
1126         typedef int32_t UInt15;
1127         typedef int32_t UInt16;
1128         typedef int32_t AInt16;
1129 
1130         typedef int32_t SInt32;
1131         typedef int32_t UInt31;
1132         typedef int32_t UInt32;
1133         typedef int32_t AInt32;
1134 
1135         typedef int32_t ScalarUInt16;
1136         typedef int32_t ScalarSInt16;
1137 
1138         typedef float Float;
1139 
1140         template<class TTargetType>
1141         struct LosslessCast
1142         {
1143             static const int32_t& Cast(const int32_t &src)
1144             {
1145                 return src;
1146             }
1147         };
1148 
1149         typedef bool Int16CompFlag;
1150         typedef bool FloatCompFlag;
1151 
1152         static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
1153         {
1154             return a + b;
1155         }
1156 
1157         static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
1158         {
1159             return a - b;
1160         }
1161 
1162         static float Select(bool flag, float a, float b)
1163         {
1164             return flag ? a : b;
1165         }
1166 
1167         static int32_t Select(bool flag, int32_t a, int32_t b)
1168         {
1169             return flag ? a : b;
1170         }
1171 
1172         static int32_t SelectOrZero(bool flag, int32_t a)
1173         {
1174             return flag ? a : 0;
1175         }
1176 
1177         static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
1178         {
1179             if (flag)
1180                 dest = src;
1181         }
1182 
1183         static int32_t ConditionalNegate(bool flag, int32_t v)
1184         {
1185             return (flag) ? -v : v;
1186         }
1187 
1188         static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
1189         {
1190             if (!flag)
1191                 dest = src;
1192         }
1193 
1194         static void ConditionalSet(float& dest, bool flag, float src)
1195         {
1196             if (flag)
1197                 dest = src;
1198         }
1199 
1200         static void NotConditionalSet(float& dest, bool flag, float src)
1201         {
1202             if (!flag)
1203                 dest = src;
1204         }
1205 
1206         static void MakeSafeDenominator(float& v)
1207         {
1208             if (v == 0.0f)
1209                 v = 1.0f;
1210         }
1211 
1212         static int32_t SignedRightShift(int32_t v, int bits)
1213         {
1214             return v >> bits;
1215         }
1216 
1217         static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
1218         {
1219             v = (v << (32 - precision)) & 0xffffffff;
1220             return SignedRightShift(v, 32 - precision);
1221         }
1222 
1223         static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
1224         {
1225             return v & ((1 << precision) - 1);
1226         }
1227 
1228         static int32_t Min(int32_t a, int32_t b)
1229         {
1230             if (a < b)
1231                 return a;
1232             return b;
1233         }
1234 
1235         static float Min(float a, float b)
1236         {
1237             if (a < b)
1238                 return a;
1239             return b;
1240         }
1241 
1242         static int32_t Max(int32_t a, int32_t b)
1243         {
1244             if (a > b)
1245                 return a;
1246             return b;
1247         }
1248 
1249         static float Max(float a, float b)
1250         {
1251             if (a > b)
1252                 return a;
1253             return b;
1254         }
1255 
1256         static float Abs(float a)
1257         {
1258             return fabsf(a);
1259         }
1260 
1261         static int32_t Abs(int32_t a)
1262         {
1263             if (a < 0)
1264                 return -a;
1265             return a;
1266         }
1267 
1268         static float Clamp(float v, float min, float max)
1269         {
1270             if (v < min)
1271                 return min;
1272             if (v > max)
1273                 return max;
1274             return v;
1275         }
1276 
1277         static float Reciprocal(float v)
1278         {
1279             return 1.0f / v;
1280         }
1281 
1282         static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1283         {
1284             chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1285         }
1286 
1287         static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1288         {
1289             chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1290         }
1291 
1292         static float MakeFloat(float v)
1293         {
1294             return v;
1295         }
1296 
1297         static float MakeFloatZero()
1298         {
1299             return 0.0f;
1300         }
1301 
1302         static int32_t MakeUInt16(uint16_t v)
1303         {
1304             return v;
1305         }
1306 
1307         static int32_t MakeSInt16(int16_t v)
1308         {
1309             return v;
1310         }
1311 
1312         static int32_t MakeAInt16(int16_t v)
1313         {
1314             return v;
1315         }
1316 
1317         static int32_t MakeUInt15(uint16_t v)
1318         {
1319             return v;
1320         }
1321 
1322         static int32_t MakeSInt32(int32_t v)
1323         {
1324             return v;
1325         }
1326 
1327         static int32_t MakeUInt31(int32_t v)
1328         {
1329             return v;
1330         }
1331 
1332         static int32_t Extract(int32_t v, int offset)
1333         {
1334             UNREFERENCED_PARAMETER(offset);
1335             return v;
1336         }
1337 
1338         static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1339         {
1340             UNREFERENCED_PARAMETER(offset);
1341             dest = v;
1342         }
1343 
1344         static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1345         {
1346             UNREFERENCED_PARAMETER(offset);
1347             dest = v;
1348         }
1349 
1350         static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
1351         {
1352             UNREFERENCED_PARAMETER(offset);
1353             dest = v;
1354         }
1355 
1356         static float ExtractFloat(float v, int offset)
1357         {
1358             UNREFERENCED_PARAMETER(offset);
1359             return v;
1360         }
1361 
1362         static void PutFloat(float &dest, int offset, float v)
1363         {
1364             UNREFERENCED_PARAMETER(offset);
1365             dest = v;
1366         }
1367 
1368         static bool Less(int32_t a, int32_t b)
1369         {
1370             return a < b;
1371         }
1372 
1373         static bool Less(float a, float b)
1374         {
1375             return a < b;
1376         }
1377 
1378         static bool LessOrEqual(int32_t a, int32_t b)
1379         {
1380             return a < b;
1381         }
1382 
1383         static bool LessOrEqual(float a, float b)
1384         {
1385             return a < b;
1386         }
1387 
1388         static bool Equal(int32_t a, int32_t b)
1389         {
1390             return a == b;
1391         }
1392 
1393         static bool Equal(float a, float b)
1394         {
1395             return a == b;
1396         }
1397 
1398         static float ToFloat(int32_t v)
1399         {
1400             return static_cast<float>(v);
1401         }
1402 
1403         static int32_t ToUInt31(int32_t v)
1404         {
1405             return v;
1406         }
1407 
1408         static int32_t ToInt32(int32_t v)
1409         {
1410             return v;
1411         }
1412 
1413         static bool FloatFlagToInt16(bool v)
1414         {
1415             return v;
1416         }
1417 
1418         static bool Int16FlagToFloat(bool v)
1419         {
1420             return v;
1421         }
1422 
1423         static bool MakeBoolInt16(bool b)
1424         {
1425             return b;
1426         }
1427 
1428         static bool MakeBoolFloat(bool b)
1429         {
1430             return b;
1431         }
1432 
1433         static bool AndNot(bool a, bool b)
1434         {
1435             return a && !b;
1436         }
1437 
1438         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
1439         {
1440             UNREFERENCED_PARAMETER(rtz);
1441             return static_cast<int>(v);
1442         }
1443 
1444         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
1445         {
1446             UNREFERENCED_PARAMETER(ru);
1447             return static_cast<int>(ceilf(v));
1448         }
1449 
1450         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
1451         {
1452             UNREFERENCED_PARAMETER(rd);
1453             return static_cast<int>(floorf(v));
1454         }
1455 
1456         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
1457         {
1458             UNREFERENCED_PARAMETER(rtn);
1459             return static_cast<int>(floorf(v + 0.5f));
1460         }
1461 
1462         template<class TRoundMode>
1463         static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
1464         {
1465             return RoundAndConvertToInt(v, roundingMode);
1466         }
1467 
1468         template<class TRoundMode>
1469         static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
1470         {
1471             return RoundAndConvertToInt(v, roundingMode);
1472         }
1473 
1474         template<class TRoundMode>
1475         static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
1476         {
1477             return RoundAndConvertToInt(v, roundingMode);
1478         }
1479 
1480         static float Sqrt(float f)
1481         {
1482             return sqrtf(f);
1483         }
1484 
1485         static int32_t SqDiffUInt8(int32_t a, int32_t b)
1486         {
1487             int32_t delta = a - b;
1488             return delta * delta;
1489         }
1490 
1491         static int32_t SqDiffInt16(int32_t a, int32_t b)
1492         {
1493             int32_t delta = a - b;
1494             return delta * delta;
1495         }
1496 
1497         static int32_t SqDiffSInt16(int32_t a, int32_t b)
1498         {
1499             int32_t delta = a - b;
1500             return delta * delta;
1501         }
1502 
1503         static float TwosCLHalfToFloat(int32_t v)
1504         {
1505             int32_t absV = (v < 0) ? -v : v;
1506 
1507             int32_t signBits = (absV & -32768);
1508             int32_t mantissa = (absV & 0x03ff);
1509             int32_t exponent = (absV & 0x7c00);
1510 
1511             bool isDenormal = (exponent == 0);
1512 
1513             // Convert exponent to high-bits
1514             exponent = (exponent >> 3) + 14336;
1515 
1516             int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
1517 
1518             int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
1519 
1520             float f, correction;
1521             memcpy(&f, &fBits, 4);
1522             memcpy(&correction, &denormalCorrection, 4);
1523 
1524             return f - correction;
1525         }
1526 
1527         static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1528         {
1529             Float fa = TwosCLHalfToFloat(a);
1530 
1531             Float diff = fa - b;
1532             return diff * diff;
1533         }
1534 
1535         static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1536         {
1537             Float fa = TwosCLHalfToFloat(a);
1538             Float fb = TwosCLHalfToFloat(b);
1539 
1540             Float diff = fa - fb;
1541             return diff * diff;
1542         }
1543 
1544         static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1545         {
1546             Float fa = TwosCLHalfToFloat(a) * aWeight;
1547 
1548             Float diff = fa - b;
1549             return diff * diff;
1550         }
1551 
1552         static int32_t RightShift(int32_t v, int bits)
1553         {
1554             return SignedRightShift(v, bits);
1555         }
1556 
1557         static int32_t ToSInt16(int32_t v)
1558         {
1559             return v;
1560         }
1561 
1562         static int32_t ToUInt16(int32_t v)
1563         {
1564             return v;
1565         }
1566 
1567         static int32_t ToUInt15(int32_t v)
1568         {
1569             return v;
1570         }
1571 
1572         static int32_t XMultiply(int32_t a, int32_t b)
1573         {
1574             return a * b;
1575         }
1576 
1577         static int32_t CompactMultiply(int32_t a, int32_t b)
1578         {
1579             return a * b;
1580         }
1581 
1582         static bool AnySet(bool v)
1583         {
1584             return v;
1585         }
1586 
1587         static bool AllSet(bool v)
1588         {
1589             return v;
1590         }
1591     };
1592 
1593 #endif
1594 
1595     namespace Internal
1596     {
1597         namespace BC7Data
1598         {
            // Alpha-handling category assigned to each BC7 mode (see the m_alphaMode
            // field of BC7ModeInfo and the g_modes table).
            enum AlphaMode
            {
                AlphaMode_Combined,
                AlphaMode_Separate,
                AlphaMode_None,
            };
1605 
            // P-bit arrangement assigned to each BC7 mode (see the m_pBitMode field of
            // BC7ModeInfo and the g_modes table).
            enum PBitMode
            {
                PBitMode_PerEndpoint,
                PBitMode_PerSubset,
                PBitMode_None
            };
1612 
            // Per-mode parameters for BC7. The field order here is the column order
            // of the aggregate initializers in g_modes.
            struct BC7ModeInfo
            {
                PBitMode m_pBitMode;
                AlphaMode m_alphaMode;
                int m_rgbBits;
                int m_alphaBits;
                int m_partitionBits;
                int m_numSubsets;
                int m_indexBits;
                int m_alphaIndexBits;
                bool m_hasIndexSelector;
            };
1625 
            // One entry per BC7 mode, indexed by mode number (0-7).
            // Columns: pBitMode, alphaMode, rgbBits, alphaBits, partitionBits,
            //          numSubsets, indexBits, alphaIndexBits, hasIndexSelector
            BC7ModeInfo g_modes[] =
            {
                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)

                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
            };
1638 
			// Weight tables with 4, 8 and 16 entries, each running from 0 to 64.
			// NOTE(review): presumably the interpolation weights for 2-, 3- and 4-bit
			// indices - confirm against the code that reads them.
			const int g_weight2[] = { 0, 21, 43, 64 };
			const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
			const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };

			// Lookup by table number: entries 0 and 1 are NULL, entries 2-4 point at
			// g_weight2, g_weight3 and g_weight4 respectively.
			const int *g_weightTables[] =
			{
				NULL,
				NULL,
				g_weight2,
				g_weight3,
				g_weight4
			};
1651 
            // Per-mode parameters for BC6H. The field order here is the column order
            // of the aggregate initializers in g_hdrModes.
            struct BC6HModeInfo
            {
                uint16_t m_modeID;
                bool m_partitioned;
                bool m_transformed;
                int m_aPrec;
                int m_bPrec[3];
            };
1660 
            // [partitioned][precision]
            // First index: 0 = not partitioned, 1 = partitioned. Second index: a precision
            // from 0 to 16. An entry is true exactly when g_hdrModes contains a mode with
            // that m_partitioned value whose m_aPrec equals the precision.
            bool g_hdrModesExistForPrecision[2][17] =
            {
                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
            };
1668 
            // Table of BC6H modes: ten partitioned entries followed by four
            // non-partitioned entries.
            // Columns: modeID, partitioned, transformed, aPrec, { bPrec[0], bPrec[1], bPrec[2] }
            BC6HModeInfo g_hdrModes[] =
            {
                { 0x00, true,  true,  10,{ 5, 5, 5 } },
                { 0x01, true,  true,  7,{ 6, 6, 6 } },
                { 0x02, true,  true,  11,{ 5, 4, 4 } },
                { 0x06, true,  true,  11,{ 4, 5, 4 } },
                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
                { 0x12, true,  true,  8,{ 6, 5, 5 } },
                { 0x16, true,  true,  8,{ 5, 6, 5 } },
                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
                { 0x03, false, false, 10,{ 10, 10, 10 } },
                { 0x07, false, true,  11,{ 9, 9, 9 } },
                { 0x0b, false, true,  12,{ 8, 8, 8 } },
                { 0x0f, false, true,  16,{ 4, 4, 4 } },
            };
1686 
            // Largest m_aPrec value appearing in g_hdrModes, and the largest valid second
            // index of g_hdrModesExistForPrecision.
            const int g_maxHDRPrecision = 16;

            // Number of entries in g_hdrModes, derived from the array itself.
            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
1690 
            // 64 partition bitmasks of 16 bits each.
            // NOTE(review): presumably one bit per pixel of a 4x4 block, selecting which of
            // two subsets the pixel belongs to - confirm against the code that reads it.
            static uint16_t g_partitionMap[64] =
            {
                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
                0xC800, 0xFFEC, 0xFE80, 0xE800,
                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
                0xF710, 0x008E, 0x7100, 0x08CE,
                0x008C, 0x7310, 0x3100, 0x8CCE,
                0x088C, 0x3110, 0x6666, 0x366C,
                0x17E8, 0x0FF0, 0x718E, 0x399C,
                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
                0x3c3c, 0x55aa, 0x9696, 0xa55a,
                0x73ce, 0x13c8, 0x324c, 0x3bdc,
                0x6996, 0xc33c, 0x9966, 0x660,
                0x272, 0x4e4, 0x4e40, 0x2720,
                0xc936, 0x936c, 0x39c6, 0x639c,
                0x9336, 0x9cc6, 0x817e, 0xe718,
                0xccf0, 0xfcc, 0x7744, 0xee22,
            };
1710 
            // 64 partition patterns of 32 bits each.
            // NOTE(review): presumably two bits per pixel of a 4x4 block, selecting which of
            // three subsets the pixel belongs to - confirm against the code that reads it.
            static uint32_t g_partitionMap2[64] =
            {
                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
            };
1730 
1731             static int g_fixupIndexes2[64] =
1732             {
1733                 15,15,15,15,
1734                 15,15,15,15,
1735                 15,15,15,15,
1736                 15,15,15,15,
1737                 15, 2, 8, 2,
1738                 2, 8, 8,15,
1739                 2, 8, 2, 2,
1740                 8, 8, 2, 2,
1741 
1742                 15,15, 6, 8,
1743                 2, 8,15,15,
1744                 2, 8, 2, 2,
1745                 2,15,15, 6,
1746                 6, 2, 6, 8,
1747                 15,15, 2, 2,
1748                 15,15,15,15,
1749                 15, 2, 2,15,
1750             };
1751 
1752             static int g_fixupIndexes3[64][2] =
1753             {
1754                 { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
1755                 { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
1756                 { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
1757                 { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
1758                 { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
1759                 { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
1760                 { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
1761                 { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
1762 
1763                 { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
1764                 { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
1765                 { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
1766                 { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
1767                 { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
1768                 { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
1769                 { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
1770                 { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
1771             };
1772 
1773             static const unsigned char g_fragments[] =
1774             {
1775                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
1776                 0, 1, 2, 3,  // 16, 4
1777                 0, 1, 4,  // 20, 3
1778                 0, 1, 2, 4,  // 23, 4
1779                 2, 3, 7,  // 27, 3
1780                 1, 2, 3, 7,  // 30, 4
1781                 0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
1782                 0, 1, 4, 8,  // 42, 4
1783                 0, 1, 2, 4, 5, 8,  // 46, 6
1784                 0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
1785                 1, 4, 5, 6, 9,  // 60, 5
1786                 2, 5, 6, 7, 10,  // 65, 5
1787                 5, 6, 9, 10,  // 70, 4
1788                 2, 3, 7, 11,  // 74, 4
1789                 1, 2, 3, 6, 7, 11,  // 78, 6
1790                 0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
1791                 0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
1792                 2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
1793                 4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
1794                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
1795                 0, 4, 8, 12,  // 128, 4
1796                 0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
1797                 0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
1798                 0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
1799                 3, 6, 7, 8, 9, 12,  // 158, 6
1800                 3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
1801                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
1802                 0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
1803                 5, 8, 9, 10, 13,  // 192, 5
1804                 8, 12, 13,  // 197, 3
1805                 4, 8, 12, 13,  // 200, 4
1806                 2, 3, 6, 9, 12, 13,  // 204, 6
1807                 0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
1808                 0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
1809                 2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
1810                 2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
1811                 0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
1812                 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
1813                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
1814                 2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
1815                 1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
1816                 2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
1817                 2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
1818                 0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
1819                 6, 9, 10, 11, 14,  // 317, 5
1820                 0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
1821                 1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
1822                 1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
1823                 0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
1824                 0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
1825                 2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
1826                 1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
1827                 0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
1828                 0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
1829                 1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
1830                 0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
1831                 0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
1832                 8, 12, 13, 14,  // 418, 4
1833                 1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
1834                 4, 8, 9, 12, 13, 14,  // 430, 6
1835                 0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
1836                 1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
1837                 2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
1838                 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
1839                 0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
1840                 1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
1841                 3, 7, 11, 15,  // 490, 4
1842                 0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
1843                 0, 4, 5, 10, 11, 15,  // 502, 6
1844                 1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
1845                 0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
1846                 0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
1847                 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
1848                 1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
1849                 2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
1850                 0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
1851                 1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
1852                 1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
1853                 0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
1854                 0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
1855                 1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
1856                 1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
1857                 0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
1858                 0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
1859                 1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
1860                 0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
1861                 2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
1862                 0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
1863                 0, 1, 5, 10, 14, 15,  // 669, 6
1864                 0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
1865                 0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
1866                 11, 14, 15,  // 691, 3
1867                 7, 11, 14, 15,  // 694, 4
1868                 1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
1869                 0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
1870                 0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
1871                 2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
1872                 4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
1873                 0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
1874                 0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
1875                 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
1876                 0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
1877                 3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
1878                 11, 13, 14, 15,  // 792, 4
1879                 0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
1880                 0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
1881                 7, 10, 11, 13, 14, 15,  // 814, 6
1882                 3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
1883                 1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
1884                 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
1885                 12, 13, 14, 15,  // 848, 4
1886                 0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
1887                 0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
1888                 4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
1889                 4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
1890                 0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
1891                 0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
1892                 0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
1893                 0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
1894                 0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
1895                 7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
1896                 3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
1897                 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
1898                 8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
1899                 0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
1900                 0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
1901                 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
1902                 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
1903                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
1904                 0, 2,  // 1040, 2
1905                 1, 3,  // 1042, 2
1906                 0, 1, 4, 5,  // 1044, 4
1907                 0, 1, 2, 4, 5,  // 1048, 5
1908                 2, 3, 6,  // 1053, 3
1909                 0, 2, 4, 6,  // 1056, 4
1910                 1, 2, 5, 6,  // 1060, 4
1911                 0, 1, 2, 3, 5, 6,  // 1064, 6
1912                 0, 1, 2, 4, 5, 6,  // 1070, 6
1913                 0, 1, 2, 3, 4, 5, 6,  // 1076, 7
1914                 0, 3, 4, 7,  // 1083, 4
1915                 0, 1, 2, 3, 4, 7,  // 1087, 6
1916                 1, 3, 5, 7,  // 1093, 4
1917                 2, 3, 6, 7,  // 1097, 4
1918                 1, 2, 3, 6, 7,  // 1101, 5
1919                 1, 2, 3, 5, 6, 7,  // 1106, 6
1920                 0, 1, 2, 3, 5, 6, 7,  // 1112, 7
1921                 4, 5, 6, 7,  // 1119, 4
1922                 0, 8,  // 1123, 2
1923                 0, 1, 4, 5, 8,  // 1125, 5
1924                 0, 1, 8, 9,  // 1130, 4
1925                 4, 5, 8, 9,  // 1134, 4
1926                 0, 1, 4, 5, 8, 9,  // 1138, 6
1927                 2, 6, 8, 9,  // 1144, 4
1928                 6, 7, 8, 9,  // 1148, 4
1929                 0, 2, 4, 6, 8, 10,  // 1152, 6
1930                 1, 2, 5, 6, 9, 10,  // 1158, 6
1931                 0, 3, 4, 7, 9, 10,  // 1164, 6
1932                 0, 1, 2, 8, 9, 10,  // 1170, 6
1933                 4, 5, 6, 8, 9, 10,  // 1176, 6
1934                 3, 11,  // 1182, 2
1935                 2, 3, 6, 7, 11,  // 1184, 5
1936                 0, 3, 8, 11,  // 1189, 4
1937                 0, 3, 4, 7, 8, 11,  // 1193, 6
1938                 1, 3, 5, 7, 9, 11,  // 1199, 6
1939                 2, 3, 10, 11,  // 1205, 4
1940                 1, 5, 10, 11,  // 1209, 4
1941                 4, 5, 10, 11,  // 1213, 4
1942                 6, 7, 10, 11,  // 1217, 4
1943                 2, 3, 6, 7, 10, 11,  // 1221, 6
1944                 1, 2, 3, 9, 10, 11,  // 1227, 6
1945                 5, 6, 7, 9, 10, 11,  // 1233, 6
1946                 8, 9, 10, 11,  // 1239, 4
1947                 4, 12,  // 1243, 2
1948                 0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
1949                 8, 9, 12,  // 1253, 3
1950                 0, 4, 5, 8, 9, 12,  // 1256, 6
1951                 0, 1, 4, 5, 8, 9, 12,  // 1262, 7
1952                 2, 3, 5, 6, 8, 9, 12,  // 1269, 7
1953                 1, 5, 9, 13,  // 1276, 4
1954                 6, 7, 9, 13,  // 1280, 4
1955                 1, 4, 7, 10, 13,  // 1284, 5
1956                 1, 6, 8, 11, 13,  // 1289, 5
1957                 0, 1, 12, 13,  // 1294, 4
1958                 4, 5, 12, 13,  // 1298, 4
1959                 0, 1, 6, 7, 12, 13,  // 1302, 6
1960                 0, 1, 4, 8, 12, 13,  // 1308, 6
1961                 8, 9, 12, 13,  // 1314, 4
1962                 4, 8, 9, 12, 13,  // 1318, 5
1963                 4, 5, 8, 9, 12, 13,  // 1323, 6
1964                 0, 4, 5, 8, 9, 12, 13,  // 1329, 7
1965                 0, 1, 6, 10, 12, 13,  // 1336, 6
1966                 3, 6, 7, 9, 10, 12, 13,  // 1342, 7
1967                 0, 1, 10, 11, 12, 13,  // 1349, 6
1968                 2, 4, 7, 9, 14,  // 1355, 5
1969                 4, 5, 10, 14,  // 1360, 4
1970                 2, 6, 10, 14,  // 1364, 4
1971                 2, 5, 8, 11, 14,  // 1368, 5
1972                 0, 2, 12, 14,  // 1373, 4
1973                 8, 10, 12, 14,  // 1377, 4
1974                 4, 6, 8, 10, 12, 14,  // 1381, 6
1975                 13, 14,  // 1387, 2
1976                 9, 10, 13, 14,  // 1389, 4
1977                 5, 6, 9, 10, 13, 14,  // 1393, 6
1978                 0, 1, 2, 12, 13, 14,  // 1399, 6
1979                 4, 5, 6, 12, 13, 14,  // 1405, 6
1980                 8, 9, 12, 13, 14,  // 1411, 5
1981                 8, 9, 10, 12, 13, 14,  // 1416, 6
1982                 7, 15,  // 1422, 2
1983                 0, 5, 10, 15,  // 1424, 4
1984                 0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
1985                 10, 11, 15,  // 1436, 3
1986                 0, 1, 5, 6, 10, 11, 15,  // 1439, 7
1987                 3, 6, 7, 10, 11, 15,  // 1446, 6
1988                 12, 15,  // 1452, 2
1989                 0, 3, 12, 15,  // 1454, 4
1990                 4, 7, 12, 15,  // 1458, 4
1991                 0, 3, 6, 9, 12, 15,  // 1462, 6
1992                 0, 3, 5, 10, 12, 15,  // 1468, 6
1993                 8, 11, 12, 15,  // 1474, 4
1994                 5, 6, 8, 11, 12, 15,  // 1478, 6
1995                 4, 7, 8, 11, 12, 15,  // 1484, 6
1996                 1, 3, 13, 15,  // 1490, 4
1997                 9, 11, 13, 15,  // 1494, 4
1998                 5, 7, 9, 11, 13, 15,  // 1498, 6
1999                 2, 3, 14, 15,  // 1504, 4
2000                 2, 3, 4, 5, 14, 15,  // 1508, 6
2001                 6, 7, 14, 15,  // 1514, 4
2002                 2, 3, 5, 9, 14, 15,  // 1518, 6
2003                 2, 3, 8, 9, 14, 15,  // 1524, 6
2004                 10, 14, 15,  // 1530, 3
2005                 0, 4, 5, 9, 10, 14, 15,  // 1533, 7
2006                 2, 3, 7, 11, 14, 15,  // 1540, 6
2007                 10, 11, 14, 15,  // 1546, 4
2008                 7, 10, 11, 14, 15,  // 1550, 5
2009                 6, 7, 10, 11, 14, 15,  // 1555, 6
2010                 1, 2, 3, 13, 14, 15,  // 1561, 6
2011                 5, 6, 7, 13, 14, 15,  // 1567, 6
2012                 10, 11, 13, 14, 15,  // 1573, 5
2013                 9, 10, 11, 13, 14, 15,  // 1578, 6
2014                 0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
2015                 9, 10, 12, 13, 14, 15,  // 1592, 6
2016                 8, 11, 12, 13, 14, 15,  // 1598, 6
2017                 3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
2018             };
2019             static const int g_shapeRanges[][2] =
2020             {
2021                 { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
2022                 { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
2023                 { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
2024                 { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
2025                 { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
2026                 { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
2027                 { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
2028                 { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
2029                 { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
2030                 { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
2031                 { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
2032                 { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
2033                 { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
2034                 { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
2035                 { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
2036                 { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
2037                 { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
2038                 { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
2039                 { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
2040                 { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
2041                 { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
2042                 { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
2043                 { 1604, 8 },
2044             };
2045             static const int g_shapes1[][2] =
2046             {
2047                 { 0, 16 }
2048             };
2049             static const int g_shapes2[64][2] =
2050             {
2051                 { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
2052                 { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
2053                 { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
2054                 { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
2055                 { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
2056                 { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
2057                 { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
2058                 { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
2059             };
2060             static const int g_shapes3[64][3] =
2061             {
2062                 { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
2063                 { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
2064                 { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
2065                 { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
2066                 { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
2067                 { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
2068                 { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
2069                 { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
2070             };
2071 
2072             static const int g_shapeList1[] =
2073             {
2074                 0,
2075             };
2076 
2077             static const int g_shapeList1Collapse[] =
2078             {
2079                 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2080                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2081                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2082                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2083                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2084                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2085                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2086                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2087                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2088                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2089                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2090                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2091                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2092                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2093                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2094                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2095                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2096                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2097                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2098                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2099                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2100                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2101                 -1,
2102             };
2103             static const int g_shapeList2[] =
2104             {
2105                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
2106                 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
2107                 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
2108                 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
2109                 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
2110                 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
2111                 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
2112                 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
2113                 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
2114                 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
2115                 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
2116                 122, 123, 124, 125, 126, 127, 128,
2117             };
2118             static const int g_shapeList2Collapse[] =
2119             {
2120                 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
2121                 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
2122                 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2123                 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
2124                 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
2125                 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
2126                 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
2127                 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
2128                 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
2129                 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
2130                 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
2131                 120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1,
2132                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2133                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2134                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2135                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2136                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2137                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2138                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2139                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2140                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2141                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2142                 -1,
2143             };
2144 
2145             static const int g_shapeList12[] =
2146             {
2147                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2148                 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
2149                 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
2150                 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
2151                 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
2152                 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
2153                 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
2154                 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
2155                 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
2156                 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
2157                 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
2158                 121, 122, 123, 124, 125, 126, 127, 128,
2159             };
2160 
2161             static const int g_shapeList12Collapse[] =
2162             {
2163                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
2164                 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
2165                 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
2166                 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
2167                 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
2168                 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
2169                 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
2170                 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
2171                 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
2172                 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
2173                 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
2174                 121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1,
2175                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2176                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2177                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2178                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2179                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2180                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2181                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2182                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2183                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2184                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2185                 -1,
2186             };
2187 
2188             static const int g_shapeList3[] =
2189             {
2190                 1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
2191                 33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
2192                 110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
2193                 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
2194                 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
2195                 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
2196                 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
2197                 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
2198                 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
2199                 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
2200                 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
2201                 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
2202                 235, 236, 237, 238, 239, 240, 241, 242,
2203             };
2204 
2205             static const int g_shapeList3Collapse[] =
2206             {
2207                 -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1,
2208                 -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1,
2209                 -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1,
2210                 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2211                 -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13,
2212                 -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
2213                 16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2214                 -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2215                 -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1,
2216                 -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21,
2217                 22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2218                 -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28,
2219                 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
2220                 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
2221                 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
2222                 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
2223                 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
2224                 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
2225                 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
2226                 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
2227                 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
2228                 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
2229                 139,
2230             };
2231 
2232             static const int g_shapeList3Short[] =
2233             {
2234                 1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
2235                 106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
2236                 171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
2237                 233, 237, 240,
2238             };
2239 
            // Inverse lookup for g_shapeList3Short, indexed by shape value (243
            // entries, indices 0..242): entry [s] is the position of s inside
            // g_shapeList3Short, or -1 when s does not appear in that list.
            // Example: g_shapeList3Short[4] == 18, so entry [18] here is 4.
            static const int g_shapeList3ShortCollapse[] =
            {
                -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1,
                -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1,
                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1,
                12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14,
                15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17,
                18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23,
                -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26,
                27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1,
                -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1,
                -1,
            };
2266 
            // Identity shape list: entry [i] == i for every i in 0..242 (243 entries).
            // It has the same "array of shape values" form as the filtered lists
            // (e.g. g_shapeList3Short), so code written against a shape list can be
            // handed this one to visit every shape value.
            static const int g_shapeListAll[] =
            {
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
                242,
            };
2293 
            // Element counts of the shape and fragment tables, derived at compile time
            // with the sizeof(array) / sizeof(element) idiom so they always track the
            // table contents.
            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);

            // The larger of the g_shapeList2 and g_shapeList3 element counts.
            // NOTE(review): presumably used to size per-mode scratch storage — confirm at the use sites.
            static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3;
2303         }
2304 
2305         namespace BC6HData
2306         {
            // Identifies which logical field a bit of a BC6H mode layout belongs to
            // (see g_modeDescriptors). The two-letter names combine a colour channel
            // (R, G, B) with a slot letter (W, X, Y, Z).
            // NOTE(review): W/X/Y/Z presumably name the four endpoint values of a
            // block (W/X for the first region, Y/Z for the second) — confirm
            // against the encoder/decoder that consumes these descriptors.
            enum EField
            {
                NA, // N/A
                M,  // Mode
                D,  // Shape
                RW,
                RX,
                RY,
                RZ,
                GW,
                GX,
                GY,
                GZ,
                BW,
                BX,
                BY,
                BZ,
            };
2325 
            // One entry of a BC6H mode layout: names a field and a bit index within
            // that field. Kept as a plain aggregate so that g_modeDescriptors can be
            // brace-initialised with { field, bit } pairs.
            struct ModeDescriptor
            {
                EField m_eField;    // Field this entry refers to (NA for padding entries)
                uint8_t   m_uBit;   // Bit index within that field
            };
2331 
            // Bit layout table for the 14 BC6H modes: one row per mode, 82 entries per
            // row, each entry naming a field (EField) and the bit of that field.
            // NOTE(review): entry k presumably describes bit k of the encoded block
            // (mode bits first, then endpoint fields, then shape bits) — confirm
            // against the code that walks this table.
            //
            // The numbers in each row's comment match the field widths visible in the
            // row: the first is the width of the W fields, the rest are the widths of
            // the remaining R, G and B fields (e.g. "11 4 5 4": RW/GW/BW use bits
            // 0..10, RX uses 0..3, GX uses 0..4, BX uses 0..3). The hexadecimal value
            // in parentheses is the mode code noted by the original author.
            //
            // Modes 11-14 only use the W and X fields and carry no shape (D) bits;
            // their last 17 entries are NA padding so that every row has 82 entries.
            const ModeDescriptor g_modeDescriptors[14][82] =
            {
                {   // Mode 1 (0x00) - 10 5 5 5
                    { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 2 (0x01) - 7 6 6 6
                    { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 3 (0x02) - 11 5 4 4
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 4 (0x06) - 11 4 5 4
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 5 (0x0a) - 11 4 4 5
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
                    { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 6 (0x0e) - 9 5 5 5
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 7 (0x12) - 8 6 5 5
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 8 (0x16) - 8 5 6 5
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 9 (0x1a) - 8 5 5 6
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 10 (0x1e) - 6 6 6 6
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
                    { D, 3 },{ D, 4 },
                },

                {   // Mode 11 (0x03) - 10 10
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },
                },

                {   // Mode 12 (0x07) - 11 9
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },
                },

                {   // Mode 13 (0x0b) - 12 8
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },
                },

                {   // Mode 14 (0x0f) - 16 4
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
                    { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
                    { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
                    { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
                    { NA, 0 },{ NA, 0 },
                },
            };
2502         }
2503 
2504         struct PackingVector
2505         {
2506             uint32_t m_vector[4];
2507             int m_offset;
2508 
Initcvtt::Internal::PackingVector2509             void Init()
2510             {
2511                 for (int i = 0; i < 4; i++)
2512                     m_vector[i] = 0;
2513 
2514                 m_offset = 0;
2515             }
2516 
Packcvtt::Internal::PackingVector2517             inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
2518             {
2519                 int vOffset = m_offset >> 5;
2520                 int bitOffset = m_offset & 0x1f;
2521 
2522                 m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
2523 
2524                 int overflowBits = bitOffset + bits - 32;
2525                 if (overflowBits > 0)
2526                     m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
2527 
2528                 m_offset += bits;
2529             }
2530 
Flushcvtt::Internal::PackingVector2531             inline void Flush(uint8_t* output)
2532             {
2533                 assert(m_offset == 128);
2534 
2535                 for (int v = 0; v < 4; v++)
2536                 {
2537                     uint32_t chunk = m_vector[v];
2538                     for (int b = 0; b < 4; b++)
2539                         output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
2540                 }
2541             }
2542         };
2543 
2544 
2545 		struct UnpackingVector
2546 		{
2547 			uint32_t m_vector[4];
2548 
Initcvtt::Internal::UnpackingVector2549 			void Init(const uint8_t *bytes)
2550 			{
2551 				for (int i = 0; i < 4; i++)
2552 					m_vector[i] = 0;
2553 
2554 				for (int b = 0; b < 16; b++)
2555 					m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
2556 			}
2557 
Unpackcvtt::Internal::UnpackingVector2558 			inline ParallelMath::ScalarUInt16 Unpack(int bits)
2559 			{
2560 				uint32_t bitMask = (1 << bits) - 1;
2561 
2562 				ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
2563 
2564 				for (int i = 0; i < 4; i++)
2565 				{
2566 					m_vector[i] >>= bits;
2567 					if (i != 3)
2568 						m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
2569 				}
2570 
2571 				return result;
2572 			}
2573 		};
2574 
ComputeTweakFactors(int tweak,int range,float * outFactors)2575         void ComputeTweakFactors(int tweak, int range, float *outFactors)
2576         {
2577             int totalUnits = range - 1;
2578             int minOutsideUnits = ((tweak >> 1) & 1);
2579             int maxOutsideUnits = (tweak & 1);
2580             int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
2581 
2582             outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
2583             outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
2584         }
2585 
ScaleHDRValue(const ParallelMath::Float & v,bool isSigned)2586         ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
2587         {
2588             if (isSigned)
2589             {
2590                 ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
2591                 return (v * 32.0f + offset) / 31.0f;
2592             }
2593             else
2594                 return (v * 64.0f + 30.0f) / 31.0f;
2595         }
2596 
UnscaleHDRValueSigned(const ParallelMath::SInt16 & v)2597         ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
2598         {
2599 #ifdef CVTT_ENABLE_ASSERTS
2600             for (int i = 0; i < ParallelMath::ParallelSize; i++)
2601                 assert(ParallelMath::Extract(v, i) != -32768)
2602 #endif
2603 
2604             ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
2605             ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
2606 
2607             ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
2608             ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
2609             ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
2610             ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
2611 
2612             return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
2613         }
2614 
UnscaleHDRValueUnsigned(const ParallelMath::UInt16 & v)2615         ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
2616         {
2617             return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
2618         }
2619 
UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3],ParallelMath::AInt16 outEP[2][3],bool isSigned)2620         void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
2621         {
2622             for (int epi = 0; epi < 2; epi++)
2623             {
2624                 for (int ch = 0; ch < 3; ch++)
2625                 {
2626                     if (isSigned)
2627                         outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
2628                     else
2629                         outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
2630                 }
2631             }
2632         }
2633 
2634         template<int TVectorSize>
2635         class UnfinishedEndpoints
2636         {
2637         public:
2638             typedef ParallelMath::Float MFloat;
2639             typedef ParallelMath::UInt16 MUInt16;
2640             typedef ParallelMath::UInt15 MUInt15;
2641             typedef ParallelMath::SInt16 MSInt16;
2642             typedef ParallelMath::SInt32 MSInt32;
2643 
UnfinishedEndpoints()2644             UnfinishedEndpoints()
2645             {
2646             }
2647 
UnfinishedEndpoints(const MFloat * base,const MFloat * offset)2648             UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
2649             {
2650                 for (int ch = 0; ch < TVectorSize; ch++)
2651                     m_base[ch] = base[ch];
2652                 for (int ch = 0; ch < TVectorSize; ch++)
2653                     m_offset[ch] = offset[ch];
2654             }
2655 
UnfinishedEndpoints(const UnfinishedEndpoints & other)2656             UnfinishedEndpoints(const UnfinishedEndpoints& other)
2657             {
2658                 for (int ch = 0; ch < TVectorSize; ch++)
2659                     m_base[ch] = other.m_base[ch];
2660                 for (int ch = 0; ch < TVectorSize; ch++)
2661                     m_offset[ch] = other.m_offset[ch];
2662             }
2663 
FinishHDRUnsigned(int tweak,int range,MSInt16 * outEP0,MSInt16 * outEP1,ParallelMath::RoundTowardNearestForScope * roundingMode)2664             void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
2665             {
2666                 float tweakFactors[2];
2667                 ComputeTweakFactors(tweak, range, tweakFactors);
2668 
2669                 for (int ch = 0; ch < TVectorSize; ch++)
2670                 {
2671                     MUInt15 channelEPs[2];
2672                     for (int epi = 0; epi < 2; epi++)
2673                     {
2674                         MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f);
2675                         channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
2676                     }
2677 
2678                     outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
2679                     outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
2680                 }
2681             }
2682 
FinishHDRSigned(int tweak,int range,MSInt16 * outEP0,MSInt16 * outEP1,ParallelMath::RoundTowardNearestForScope * roundingMode)2683             void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
2684             {
2685                 float tweakFactors[2];
2686                 ComputeTweakFactors(tweak, range, tweakFactors);
2687 
2688                 for (int ch = 0; ch < TVectorSize; ch++)
2689                 {
2690                     MSInt16 channelEPs[2];
2691                     for (int epi = 0; epi < 2; epi++)
2692                     {
2693                         MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
2694                         channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
2695                     }
2696 
2697                     outEP0[ch] = channelEPs[0];
2698                     outEP1[ch] = channelEPs[1];
2699                 }
2700             }
2701 
FinishLDR(int tweak,int range,MUInt15 * outEP0,MUInt15 * outEP1)2702             void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
2703             {
2704                 ParallelMath::RoundTowardNearestForScope roundingMode;
2705 
2706                 float tweakFactors[2];
2707                 ComputeTweakFactors(tweak, range, tweakFactors);
2708 
2709                 for (int ch = 0; ch < TVectorSize; ch++)
2710                 {
2711                     MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
2712                     MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
2713                     outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
2714                     outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
2715                 }
2716             }
2717 
2718             template<int TNewVectorSize>
ExpandTo(float filler)2719             UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
2720             {
2721                 MFloat newBase[TNewVectorSize];
2722                 MFloat newOffset[TNewVectorSize];
2723 
2724                 for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
2725                 {
2726                     newBase[ch] = m_base[ch];
2727                     newOffset[ch] = m_offset[ch];
2728                 }
2729 
2730                 MFloat fillerV = ParallelMath::MakeFloat(filler);
2731 
2732                 for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
2733                 {
2734                     newBase[ch] = fillerV;
2735                     newOffset[ch] = ParallelMath::MakeFloatZero();
2736                 }
2737 
2738                 return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
2739             }
2740 
2741         private:
2742             MFloat m_base[TVectorSize];
2743             MFloat m_offset[TVectorSize];
2744         };
2745 
2746         template<int TMatrixSize>
2747         class PackedCovarianceMatrix
2748         {
2749         public:
2750             // 0: xx,
2751             // 1: xy, yy
2752             // 3: xz, yz, zz
2753             // 6: xw, yw, zw, ww
2754             // ... etc.
2755             static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
2756 
2757             typedef ParallelMath::Float MFloat;
2758 
PackedCovarianceMatrix()2759             PackedCovarianceMatrix()
2760             {
2761                 for (int i = 0; i < PyramidSize; i++)
2762                     m_values[i] = ParallelMath::MakeFloatZero();
2763             }
2764 
Add(const ParallelMath::Float * vec,const ParallelMath::Float & weight)2765             void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
2766             {
2767                 int index = 0;
2768                 for (int row = 0; row < TMatrixSize; row++)
2769                 {
2770                     for (int col = 0; col <= row; col++)
2771                     {
2772                         m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
2773                         index++;
2774                     }
2775                 }
2776             }
2777 
Product(MFloat * outVec,const MFloat * inVec)2778             void Product(MFloat *outVec, const MFloat *inVec)
2779             {
2780                 for (int row = 0; row < TMatrixSize; row++)
2781                 {
2782                     MFloat sum = ParallelMath::MakeFloatZero();
2783 
2784                     int index = (row * (row + 1)) >> 1;
2785                     for (int col = 0; col < TMatrixSize; col++)
2786                     {
2787                         sum = sum + inVec[col] * m_values[index];
2788                         if (col >= row)
2789                             index += col + 1;
2790                         else
2791                             index++;
2792                     }
2793 
2794                     outVec[row] = sum;
2795                 }
2796             }
2797 
2798         private:
2799             ParallelMath::Float m_values[PyramidSize];
2800         };
2801 
2802         static const int NumEndpointSelectorPasses = 3;
2803 
2804         template<int TVectorSize, int TIterationCount>
2805         class EndpointSelector
2806         {
2807         public:
2808             typedef ParallelMath::Float MFloat;
2809 
EndpointSelector()2810             EndpointSelector()
2811             {
2812                 for (int ch = 0; ch < TVectorSize; ch++)
2813                 {
2814                     m_centroid[ch] = ParallelMath::MakeFloatZero();
2815                     m_direction[ch] = ParallelMath::MakeFloatZero();
2816                 }
2817                 m_weightTotal = ParallelMath::MakeFloatZero();
2818                 m_minDist = ParallelMath::MakeFloat(FLT_MAX);
2819                 m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
2820             }
2821 
ContributePass(const MFloat * value,int pass,const MFloat & weight)2822             void ContributePass(const MFloat *value, int pass, const MFloat &weight)
2823             {
2824                 if (pass == 0)
2825                     ContributeCentroid(value, weight);
2826                 else if (pass == 1)
2827                     ContributeDirection(value, weight);
2828                 else if (pass == 2)
2829                     ContributeMinMax(value);
2830             }
2831 
FinishPass(int pass)2832             void FinishPass(int pass)
2833             {
2834                 if (pass == 0)
2835                     FinishCentroid();
2836                 else if (pass == 1)
2837                     FinishDirection();
2838             }
2839 
GetEndpoints(const float channelWeights[TVectorSize]) const2840             UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
2841             {
2842                 MFloat unweightedBase[TVectorSize];
2843                 MFloat unweightedOffset[TVectorSize];
2844 
2845                 for (int ch = 0; ch < TVectorSize; ch++)
2846                 {
2847                     MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
2848                     MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
2849 
2850                     float safeWeight = channelWeights[ch];
2851                     if (safeWeight == 0.f)
2852                         safeWeight = 1.0f;
2853 
2854                     unweightedBase[ch] = min / channelWeights[ch];
2855                     unweightedOffset[ch] = (max - min) / channelWeights[ch];
2856                 }
2857 
2858                 return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
2859             }
2860 
2861         private:
ContributeCentroid(const MFloat * value,const MFloat & weight)2862             void ContributeCentroid(const MFloat *value, const MFloat &weight)
2863             {
2864                 for (int ch = 0; ch < TVectorSize; ch++)
2865                     m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
2866                 m_weightTotal = m_weightTotal + weight;
2867             }
2868 
FinishCentroid()2869             void FinishCentroid()
2870             {
2871                 MFloat denom = m_weightTotal;
2872                 ParallelMath::MakeSafeDenominator(denom);
2873 
2874                 for (int ch = 0; ch < TVectorSize; ch++)
2875                     m_centroid[ch] = m_centroid[ch] / denom;
2876             }
2877 
ContributeDirection(const MFloat * value,const MFloat & weight)2878             void ContributeDirection(const MFloat *value, const MFloat &weight)
2879             {
2880                 MFloat diff[TVectorSize];
2881                 for (int ch = 0; ch < TVectorSize; ch++)
2882                     diff[ch] = value[ch] - m_centroid[ch];
2883 
2884                 m_covarianceMatrix.Add(diff, weight);
2885             }
2886 
FinishDirection()2887             void FinishDirection()
2888             {
2889                 MFloat approx[TVectorSize];
2890                 for (int ch = 0; ch < TVectorSize; ch++)
2891                     approx[ch] = ParallelMath::MakeFloat(1.0f);
2892 
2893                 for (int i = 0; i < TIterationCount; i++)
2894                 {
2895                     MFloat product[TVectorSize];
2896                     m_covarianceMatrix.Product(product, approx);
2897 
2898                     MFloat largestComponent = product[0];
2899                     for (int ch = 1; ch < TVectorSize; ch++)
2900                         largestComponent = ParallelMath::Max(largestComponent, product[ch]);
2901 
2902                     // product = largestComponent*newApprox
2903                     ParallelMath::MakeSafeDenominator(largestComponent);
2904                     for (int ch = 0; ch < TVectorSize; ch++)
2905                         approx[ch] = product[ch] / largestComponent;
2906                 }
2907 
2908                 // Normalize
2909                 MFloat approxLen = ParallelMath::MakeFloatZero();
2910                 for (int ch = 0; ch < TVectorSize; ch++)
2911                     approxLen = approxLen + approx[ch] * approx[ch];
2912 
2913                 approxLen = ParallelMath::Sqrt(approxLen);
2914 
2915                 ParallelMath::MakeSafeDenominator(approxLen);
2916 
2917                 for (int ch = 0; ch < TVectorSize; ch++)
2918                     m_direction[ch] = approx[ch] / approxLen;
2919             }
2920 
ContributeMinMax(const MFloat * value)2921             void ContributeMinMax(const MFloat *value)
2922             {
2923                 MFloat dist = ParallelMath::MakeFloatZero();
2924                 for (int ch = 0; ch < TVectorSize; ch++)
2925                     dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
2926 
2927                 m_minDist = ParallelMath::Min(m_minDist, dist);
2928                 m_maxDist = ParallelMath::Max(m_maxDist, dist);
2929             }
2930 
2931             ParallelMath::Float m_centroid[TVectorSize];
2932             ParallelMath::Float m_direction[TVectorSize];
2933             PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix;
2934             ParallelMath::Float m_weightTotal;
2935 
2936             ParallelMath::Float m_minDist;
2937             ParallelMath::Float m_maxDist;
2938         };
2939 
2940         static const ParallelMath::UInt16 g_weightReciprocals[] =
2941         {
2942             ParallelMath::MakeUInt16(0),        // -1
2943             ParallelMath::MakeUInt16(0),        // 0
2944             ParallelMath::MakeUInt16(32768),    // 1
2945             ParallelMath::MakeUInt16(16384),    // 2
2946             ParallelMath::MakeUInt16(10923),    // 3
2947             ParallelMath::MakeUInt16(8192),     // 4
2948             ParallelMath::MakeUInt16(6554),     // 5
2949             ParallelMath::MakeUInt16(5461),     // 6
2950             ParallelMath::MakeUInt16(4681),     // 7
2951             ParallelMath::MakeUInt16(4096),     // 8
2952             ParallelMath::MakeUInt16(3641),     // 9
2953             ParallelMath::MakeUInt16(3277),     // 10
2954             ParallelMath::MakeUInt16(2979),     // 11
2955             ParallelMath::MakeUInt16(2731),     // 12
2956             ParallelMath::MakeUInt16(2521),     // 13
2957             ParallelMath::MakeUInt16(2341),     // 14
2958             ParallelMath::MakeUInt16(2185),     // 15
2959         };
2960 
2961         template<int TVectorSize>
2962         class IndexSelector
2963         {
2964         public:
2965             typedef ParallelMath::Float MFloat;
2966             typedef ParallelMath::UInt16 MUInt16;
2967             typedef ParallelMath::UInt15 MUInt15;
2968             typedef ParallelMath::SInt16 MSInt16;
2969             typedef ParallelMath::AInt16 MAInt16;
2970             typedef ParallelMath::SInt32 MSInt32;
2971             typedef ParallelMath::UInt31 MUInt31;
2972 
2973             template<class TInterpolationEPType, class TColorEPType>
Init(const float * channelWeights,const TInterpolationEPType interpolationEndPoints[2][TVectorSize],const TColorEPType colorSpaceEndpoints[2][TVectorSize],int range)2974             void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
2975             {
2976                 // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
2977                 // We need to select indexes using the color-space endpoints.
2978 
2979                 m_isUniform = true;
2980                 for (int ch = 1; ch < TVectorSize; ch++)
2981                 {
2982                     if (channelWeights[ch] != channelWeights[0])
2983                         m_isUniform = false;
2984                 }
2985 
2986                 // To work with channel weights, we need something where:
2987                 // pxDiff = px - ep[0]
2988                 // epDiff = ep[1] - ep[0]
2989                 //
2990                 // weightedEPDiff = epDiff * channelWeights
2991                 // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
2992                 // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
2993                 // index = normalizedIndex * maxValue
2994                 //
2995                 // Equivalent to:
2996                 // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
2997                 // index = dot(axis, pxDiff)
2998 
2999                 for (int ep = 0; ep < 2; ep++)
3000                     for (int ch = 0; ch < TVectorSize; ch++)
3001                         m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
3002 
3003                 m_range = range;
3004                 m_maxValue = static_cast<float>(range - 1);
3005 
3006                 MFloat epDiffWeighted[TVectorSize];
3007                 for (int ch = 0; ch < TVectorSize; ch++)
3008                 {
3009                     m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
3010                     MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
3011                     epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
3012                 }
3013 
3014                 MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
3015                 for (int ch = 1; ch < TVectorSize; ch++)
3016                     lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
3017 
3018                 ParallelMath::MakeSafeDenominator(lenSquared);
3019 
3020                 MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
3021 
3022                 for (int ch = 0; ch < TVectorSize; ch++)
3023                     m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
3024             }
3025 
3026             template<bool TSigned>
Init(const float channelWeights[TVectorSize],const MUInt15 endPoints[2][TVectorSize],int range)3027             void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
3028             {
3029                 MAInt16 converted[2][TVectorSize];
3030                 for (int epi = 0; epi < 2; epi++)
3031                     for (int ch = 0; ch < TVectorSize; ch++)
3032                         converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
3033 
3034                 Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
3035             }
3036 
ReconstructLDR_BC7(const MUInt15 & index,MUInt15 * pixel,int numRealChannels)3037             void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
3038             {
3039                 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
3040 
3041                 for (int ch = 0; ch < numRealChannels; ch++)
3042                 {
3043                     MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
3044                     MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
3045                     pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
3046                 }
3047             }
3048 
ReconstructLDRPrecise(const MUInt15 & index,MUInt15 * pixel,int numRealChannels)3049             void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
3050             {
3051                 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
3052 
3053                 for (int ch = 0; ch < numRealChannels; ch++)
3054                 {
3055                     MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
3056                     MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
3057                     pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
3058                 }
3059             }
3060 
ReconstructLDR_BC7(const MUInt15 & index,MUInt15 * pixel)3061             void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
3062             {
3063                 ReconstructLDR_BC7(index, pixel, TVectorSize);
3064             }
3065 
ReconstructLDRPrecise(const MUInt15 & index,MUInt15 * pixel)3066             void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
3067             {
3068                 ReconstructLDRPrecise(index, pixel, TVectorSize);
3069             }
3070 
SelectIndexLDR(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope * rtn) const3071             MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
3072             {
3073                 MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
3074                 for (int ch = 1; ch < TVectorSize; ch++)
3075                     dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
3076 
3077                 return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
3078             }
3079 
3080         protected:
3081             MAInt16 m_endPoint[2][TVectorSize];
3082 
3083         private:
3084             MFloat m_origin[TVectorSize];
3085             MFloat m_axis[TVectorSize];
3086             int m_range;
3087             float m_maxValue;
3088             bool m_isUniform;
3089         };
3090 
3091 
3092         template<int TVectorSize>
3093         class IndexSelectorHDR : public IndexSelector<TVectorSize>
3094         {
3095         public:
3096             typedef ParallelMath::UInt15 MUInt15;
3097             typedef ParallelMath::UInt16 MUInt16;
3098             typedef ParallelMath::UInt31 MUInt31;
3099             typedef ParallelMath::SInt16 MSInt16;
3100             typedef ParallelMath::SInt32 MSInt32;
3101             typedef ParallelMath::Float MFloat;
3102 
3103         private:
3104 
InvertSingle(const MUInt15 & anIndex) const3105             MUInt15 InvertSingle(const MUInt15& anIndex) const
3106             {
3107                 MUInt15 inverted = m_maxValueMinusOne - anIndex;
3108                 return ParallelMath::Select(m_isInverted, inverted, anIndex);
3109             }
3110 
ReconstructHDRSignedUninverted(const MUInt15 & index,MSInt16 * pixel) const3111             void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
3112             {
3113                 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
3114 
3115                 for (int ch = 0; ch < TVectorSize; ch++)
3116                 {
3117                     MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
3118                     MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
3119 
3120                     MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
3121 
3122                     pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);
3123 
3124                     pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
3125                 }
3126             }
3127 
ReconstructHDRUnsignedUninverted(const MUInt15 & index,MSInt16 * pixel) const3128             void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
3129             {
3130                 MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
3131 
3132                 for (int ch = 0; ch < TVectorSize; ch++)
3133                 {
3134                     MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
3135                     MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
3136 
3137                     MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
3138 
3139                     pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);
3140 
3141                     pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
3142                 }
3143             }
3144 
ErrorForInterpolatorComponent(int index,int ch,const MFloat * pixel) const3145             MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
3146             {
3147                 MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
3148                 return diff * diff;
3149             }
3150 
ErrorForInterpolator(int index,const MFloat * pixel) const3151             MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
3152             {
3153                 MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
3154                 for (int ch = 1; ch < TVectorSize; ch++)
3155                     error = error + ErrorForInterpolatorComponent(index, ch, pixel);
3156                 return error;
3157             }
3158 
3159         public:
3160 
InitHDR(int range,bool isSigned,bool fastIndexing,const float * channelWeights)3161             void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
3162             {
3163                 assert(range <= 16);
3164 
3165                 m_range = range;
3166 
3167                 m_isInverted = ParallelMath::MakeBoolInt16(false);
3168                 m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
3169 
3170                 if (!fastIndexing)
3171                 {
3172                     for (int i = 0; i < range; i++)
3173                     {
3174                         MSInt16 recon2CL[TVectorSize];
3175 
3176                         if (isSigned)
3177                             ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
3178                         else
3179                             ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
3180 
3181                         for (int ch = 0; ch < TVectorSize; ch++)
3182                             m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
3183                     }
3184                 }
3185             }
3186 
ReconstructHDRSigned(const MUInt15 & index,MSInt16 * pixel) const3187             void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
3188             {
3189                 ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
3190             }
3191 
ReconstructHDRUnsigned(const MUInt15 & index,MSInt16 * pixel) const3192             void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
3193             {
3194                 ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
3195             }
3196 
ConditionalInvert(const ParallelMath::Int16CompFlag & invert)3197             void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
3198             {
3199                 m_isInverted = invert;
3200             }
3201 
SelectIndexHDRSlow(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope *) const3202             MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
3203             {
3204                 MUInt15 index = ParallelMath::MakeUInt15(0);
3205 
3206                 MFloat bestError = ErrorForInterpolator(0, pixel);
3207                 for (int i = 1; i < m_range; i++)
3208                 {
3209                     MFloat error = ErrorForInterpolator(i, pixel);
3210                     ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
3211                     ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
3212                     bestError = ParallelMath::Min(bestError, error);
3213                 }
3214 
3215                 return InvertSingle(index);
3216             }
3217 
SelectIndexHDRFast(const MFloat * pixel,const ParallelMath::RoundTowardNearestForScope * rtn) const3218             MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
3219             {
3220                 return InvertSingle(this->SelectIndexLDR(pixel, rtn));
3221             }
3222 
3223         private:
3224             MFloat m_reconstructedInterpolators[16][TVectorSize];
3225             ParallelMath::Int16CompFlag m_isInverted;
3226             MUInt15 m_maxValueMinusOne;
3227             int m_range;
3228         };
3229 
3230         // Solve for a, b where v = a*t + b
3231         // This allows endpoints to be mapped to where T=0 and T=1
3232         // Least squares from totals:
3233         // a = (tv - t*v/w)/(tt - t*t/w)
3234         // b = (v - a*t)/w
3235         template<int TVectorSize>
3236         class EndpointRefiner
3237         {
3238         public:
3239             typedef ParallelMath::Float MFloat;
3240             typedef ParallelMath::UInt16 MUInt16;
3241             typedef ParallelMath::UInt15 MUInt15;
3242             typedef ParallelMath::AInt16 MAInt16;
3243             typedef ParallelMath::SInt16 MSInt16;
3244             typedef ParallelMath::SInt32 MSInt32;
3245 
3246             MFloat m_tv[TVectorSize];
3247             MFloat m_v[TVectorSize];
3248             MFloat m_tt;
3249             MFloat m_t;
3250             MFloat m_w;
3251             int m_wu;
3252 
3253             float m_rcpMaxIndex;
3254             float m_channelWeights[TVectorSize];
3255             float m_rcpChannelWeights[TVectorSize];
3256 
Init(int indexRange,const float channelWeights[TVectorSize])3257             void Init(int indexRange, const float channelWeights[TVectorSize])
3258             {
3259                 for (int ch = 0; ch < TVectorSize; ch++)
3260                 {
3261                     m_tv[ch] = ParallelMath::MakeFloatZero();
3262                     m_v[ch] = ParallelMath::MakeFloatZero();
3263                 }
3264                 m_tt = ParallelMath::MakeFloatZero();
3265                 m_t = ParallelMath::MakeFloatZero();
3266                 m_w = ParallelMath::MakeFloatZero();
3267 
3268                 m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
3269 
3270                 for (int ch = 0; ch < TVectorSize; ch++)
3271                 {
3272                     m_channelWeights[ch] = channelWeights[ch];
3273                     m_rcpChannelWeights[ch] = 1.0f;
3274                     if (m_channelWeights[ch] != 0.0f)
3275                         m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
3276                 }
3277 
3278                 m_wu = 0;
3279             }
3280 
ContributePW(const MFloat * pwFloatPixel,const MUInt15 & index,const MFloat & weight)3281             void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
3282             {
3283                 MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
3284 
3285                 for (int ch = 0; ch < TVectorSize; ch++)
3286                 {
3287                     MFloat v = pwFloatPixel[ch] * weight;
3288 
3289                     m_tv[ch] = m_tv[ch] + t * v;
3290                     m_v[ch] = m_v[ch] + v;
3291                 }
3292                 m_tt = m_tt + weight * t * t;
3293                 m_t = m_t + weight * t;
3294                 m_w = m_w + weight;
3295             }
3296 
ContributeUnweightedPW(const MFloat * pwFloatPixel,const MUInt15 & index,int numRealChannels)3297             void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
3298             {
3299                 MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
3300 
3301                 for (int ch = 0; ch < numRealChannels; ch++)
3302                 {
3303                     MFloat v = pwFloatPixel[ch];
3304 
3305                     m_tv[ch] = m_tv[ch] + t * v;
3306                     m_v[ch] = m_v[ch] + v;
3307                 }
3308                 m_tt = m_tt + t * t;
3309                 m_t = m_t + t;
3310                 m_wu++;
3311             }
3312 
ContributeUnweightedPW(const MFloat * floatPixel,const MUInt15 & index)3313             void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
3314             {
3315                 ContributeUnweightedPW(floatPixel, index, TVectorSize);
3316             }
3317 
GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])3318             void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
3319             {
3320                 // a = (tv - t*v/w)/(tt - t*t/w)
3321                 // b = (v - a*t)/w
3322                 MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
3323 
3324                 ParallelMath::MakeSafeDenominator(w);
3325                 MFloat wRcp = ParallelMath::Reciprocal(w);
3326 
3327                 MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
3328 
3329                 ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
3330                 ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
3331 
3332                 for (int ch = 0; ch < TVectorSize; ch++)
3333                 {
3334                     /*
3335                     if (adenom == 0.0)
3336                         p1 = p2 = er.v / er.w;
3337                     else
3338                     {
3339                         float4 a = (er.tv - er.t*er.v / er.w) / adenom;
3340                         float4 b = (er.v - a * er.t) / er.w;
3341                         p1 = b;
3342                         p2 = a + b;
3343                     }
3344                     */
3345 
3346                     MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
3347                     MFloat b = (m_v[ch] - a * m_t) * wRcp;
3348 
3349                     MFloat p1 = b;
3350                     MFloat p2 = a + b;
3351 
3352                     ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
3353                     ParallelMath::ConditionalSet(p2, adenomZero, p1);
3354 
3355                     // Unweight
3356                     float inverseWeight = m_rcpChannelWeights[ch];
3357 
3358                     endPoint[0][ch] = p1 * inverseWeight;
3359                     endPoint[1][ch] = p2 * inverseWeight;
3360                 }
3361             }
3362 
GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize],int numRealChannels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3363             void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
3364             {
3365                 MFloat floatEndPoint[2][TVectorSize];
3366                 GetRefinedEndpoints(floatEndPoint);
3367 
3368                 for (int epi = 0; epi < 2; epi++)
3369                     for (int ch = 0; ch < TVectorSize; ch++)
3370                         endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
3371             }
3372 
GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize],const ParallelMath::RoundTowardNearestForScope * roundingMode)3373             void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3374             {
3375                 GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
3376             }
3377 
GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize],bool isSigned,const ParallelMath::RoundTowardNearestForScope * roundingMode)3378             void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
3379             {
3380                 MFloat floatEndPoint[2][TVectorSize];
3381                 GetRefinedEndpoints(floatEndPoint);
3382 
3383                 for (int epi = 0; epi < 2; epi++)
3384                 {
3385                     for (int ch = 0; ch < TVectorSize; ch++)
3386                     {
3387                         MFloat f = floatEndPoint[epi][ch];
3388                         if (isSigned)
3389                             endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
3390                         else
3391                             endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
3392                     }
3393                 }
3394             }
3395         };
3396 
3397         template<int TVectorSize>
3398         class AggregatedError
3399         {
3400         public:
3401             typedef ParallelMath::UInt16 MUInt16;
3402             typedef ParallelMath::UInt31 MUInt31;
3403             typedef ParallelMath::Float MFloat;
3404 
AggregatedError()3405             AggregatedError()
3406             {
3407                 for (int ch = 0; ch < TVectorSize; ch++)
3408                     m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
3409             }
3410 
Add(const MUInt16 & channelErrorUnweighted,int ch)3411             void Add(const MUInt16 &channelErrorUnweighted, int ch)
3412             {
3413                 m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
3414             }
3415 
Finalize(uint32_t flags,const float channelWeightsSq[TVectorSize]) const3416             MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
3417             {
3418                 if (flags & cvtt::Flags::Uniform)
3419                 {
3420                     MUInt31 total = m_errorUnweighted[0];
3421                     for (int ch = 1; ch < TVectorSize; ch++)
3422                         total = total + m_errorUnweighted[ch];
3423                     return ParallelMath::ToFloat(total);
3424                 }
3425                 else
3426                 {
3427                     MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
3428                     for (int ch = 1; ch < TVectorSize; ch++)
3429                         total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
3430                     return total;
3431                 }
3432             }
3433 
3434         private:
3435             MUInt31 m_errorUnweighted[TVectorSize];
3436         };
3437 
3438         class BCCommon
3439         {
3440         public:
3441             typedef ParallelMath::Float MFloat;
3442             typedef ParallelMath::UInt16 MUInt16;
3443             typedef ParallelMath::UInt15 MUInt15;
3444             typedef ParallelMath::AInt16 MAInt16;
3445             typedef ParallelMath::SInt16 MSInt16;
3446             typedef ParallelMath::SInt32 MSInt32;
3447 
TweakRoundsForRange(int range)3448             static int TweakRoundsForRange(int range)
3449             {
3450                 if (range == 3)
3451                     return 3;
3452                 return 4;
3453             }
3454 
3455             template<int TVectorSize>
ComputeErrorLDR(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],int numRealChannels,AggregatedError<TVectorSize> & aggError)3456             static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
3457             {
3458                 for (int ch = 0; ch < numRealChannels; ch++)
3459                     aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
3460             }
3461 
3462             template<int TVectorSize>
ComputeErrorLDR(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],AggregatedError<TVectorSize> & aggError)3463             static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
3464             {
3465                 ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
3466             }
3467 
3468             template<int TVectorSize>
ComputeErrorLDRSimple(uint32_t flags,const MUInt15 reconstructed[TVectorSize],const MUInt15 original[TVectorSize],int numRealChannels,const float * channelWeightsSq)3469             static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
3470             {
3471                 AggregatedError<TVectorSize> aggError;
3472                 ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
3473                 return aggError.Finalize(flags, channelWeightsSq);
3474             }
3475 
3476             template<int TVectorSize>
ComputeErrorHDRFast(uint32_t flags,const MSInt16 reconstructed[TVectorSize],const MSInt16 original[TVectorSize],const float channelWeightsSq[TVectorSize])3477             static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
3478             {
3479                 MFloat error = ParallelMath::MakeFloatZero();
3480                 if (flags & Flags::Uniform)
3481                 {
3482                     for (int ch = 0; ch < TVectorSize; ch++)
3483                         error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
3484                 }
3485                 else
3486                 {
3487                     for (int ch = 0; ch < TVectorSize; ch++)
3488                         error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
3489                 }
3490 
3491                 return error;
3492             }
3493 
3494             template<int TVectorSize>
ComputeErrorHDRSlow(uint32_t flags,const MSInt16 reconstructed[TVectorSize],const MSInt16 original[TVectorSize],const float channelWeightsSq[TVectorSize])3495             static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
3496             {
3497                 MFloat error = ParallelMath::MakeFloatZero();
3498                 if (flags & Flags::Uniform)
3499                 {
3500                     for (int ch = 0; ch < TVectorSize; ch++)
3501                         error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
3502                 }
3503                 else
3504                 {
3505                     for (int ch = 0; ch < TVectorSize; ch++)
3506                         error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
3507                 }
3508 
3509                 return error;
3510             }
3511 
3512             template<int TChannelCount>
PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount],const MUInt15 pixels[16][TChannelCount],const float channelWeights[TChannelCount])3513             static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
3514             {
3515                 for (int px = 0; px < 16; px++)
3516                 {
3517                     for (int ch = 0; ch < TChannelCount; ch++)
3518                         preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
3519                 }
3520             }
3521 
3522             template<int TChannelCount>
PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount],const MSInt16 pixels[16][TChannelCount],const float channelWeights[TChannelCount])3523             static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
3524             {
3525                 for (int px = 0; px < 16; px++)
3526                 {
3527                     for (int ch = 0; ch < TChannelCount; ch++)
3528                         preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
3529                 }
3530             }
3531         };
3532 
3533         class BC7Computer
3534         {
3535         public:
            // Upper bound applied to the numTweakRounds argument of TrySinglePlane.
            static const int MaxTweakRounds = 4;

            // Short aliases for the ParallelMath vector types used throughout this class.
            typedef ParallelMath::SInt16 MSInt16;
            typedef ParallelMath::UInt15 MUInt15;
            typedef ParallelMath::UInt16 MUInt16;
            typedef ParallelMath::SInt32 MSInt32;
            typedef ParallelMath::Float MFloat;
3543 
            // Holds one candidate encoding: mode, error, endpoints, indexes and mode-specific selectors.
            // NOTE(review): the per-field meanings below are inferred from the member names and
            // array extents; confirm against the code that fills and consumes this struct.
            struct WorkInfo
            {
                MUInt15 m_mode;
                MFloat m_error;
                MUInt15 m_ep[3][2][4];      // presumably [subset][endpoint][channel] - TODO confirm
                MUInt15 m_indexes[16];      // one entry per pixel of the 4x4 block, presumably
                MUInt15 m_indexes2[16];     // presumably a second index set - TODO confirm when it is used

                // Only one of these is meaningful at a time; which one presumably depends on m_mode.
                union
                {
                    MUInt15 m_partition;
                    struct IndexSelectorAndRotation
                    {
                        MUInt15 m_indexSelector;
                        MUInt15 m_rotation;
                    } m_isr;
                } m_u;
            };
3562 
TweakAlpha(const MUInt15 original[2],int tweak,int range,MUInt15 result[2])3563             static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
3564             {
3565                 ParallelMath::RoundTowardNearestForScope roundingMode;
3566 
3567                 float tf[2];
3568                 ComputeTweakFactors(tweak, range, tf);
3569 
3570                 MFloat base = ParallelMath::ToFloat(original[0]);
3571                 MFloat offs = ParallelMath::ToFloat(original[1]) - base;
3572 
3573                 result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
3574                 result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
3575             }
3576 
Quantize(MUInt15 * color,int bits,int channels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3577             static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
3578             {
3579                 float maxColor = static_cast<float>((1 << bits) - 1);
3580 
3581                 for (int i = 0; i < channels; i++)
3582                     color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode);
3583             }
3584 
QuantizeP(MUInt15 * color,int bits,uint16_t p,int channels,const ParallelMath::RoundTowardNearestForScope * roundingMode)3585             static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
3586             {
3587                 uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits));
3588                 MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift);
3589 
3590                 float maxColorF = static_cast<float>(255 - (1 << (7 - bits)));
3591 
3592                 float maxQuantized = static_cast<float>((1 << bits) - 1);
3593 
3594                 for (int ch = 0; ch < channels; ch++)
3595                 {
3596                     MUInt15 clr = color[ch];
3597                     if (p)
3598                         clr = ParallelMath::Max(clr, pShiftV) - pShiftV;
3599 
3600                     MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF;
3601 
3602                     clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1;
3603                     if (p)
3604                         clr = clr | ParallelMath::MakeUInt15(1);
3605 
3606                     color[ch] = clr;
3607                 }
3608             }
3609 
Unquantize(MUInt15 * color,int bits,int channels)3610             static void Unquantize(MUInt15* color, int bits, int channels)
3611             {
3612                 for (int ch = 0; ch < channels; ch++)
3613                 {
3614                     MUInt15 clr = color[ch];
3615                     clr = clr << (8 - bits);
3616                     color[ch] = clr | ParallelMath::RightShift(clr, bits);
3617                 }
3618             }
3619 
CompressEndpoints0(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3620             static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3621             {
3622                 for (int j = 0; j < 2; j++)
3623                 {
3624                     QuantizeP(ep[j], 4, p[j], 3, roundingMode);
3625                     Unquantize(ep[j], 5, 3);
3626                     ep[j][3] = ParallelMath::MakeUInt15(255);
3627                 }
3628             }
3629 
CompressEndpoints1(MUInt15 ep[2][4],uint16_t p,const ParallelMath::RoundTowardNearestForScope * roundingMode)3630             static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode)
3631             {
3632                 for (int j = 0; j < 2; j++)
3633                 {
3634                     QuantizeP(ep[j], 6, p, 3, roundingMode);
3635                     Unquantize(ep[j], 7, 3);
3636                     ep[j][3] = ParallelMath::MakeUInt15(255);
3637                 }
3638             }
3639 
CompressEndpoints2(MUInt15 ep[2][4],const ParallelMath::RoundTowardNearestForScope * roundingMode)3640             static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3641             {
3642                 for (int j = 0; j < 2; j++)
3643                 {
3644                     Quantize(ep[j], 5, 3, roundingMode);
3645                     Unquantize(ep[j], 5, 3);
3646                     ep[j][3] = ParallelMath::MakeUInt15(255);
3647                 }
3648             }
3649 
CompressEndpoints3(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3650             static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3651             {
3652                 for (int j = 0; j < 2; j++)
3653                 {
3654                     QuantizeP(ep[j], 7, p[j], 3, roundingMode);
3655                     ep[j][3] = ParallelMath::MakeUInt15(255);
3656                 }
3657             }
3658 
CompressEndpoints4(MUInt15 epRGB[2][3],MUInt15 epA[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3659             static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3660             {
3661                 for (int j = 0; j < 2; j++)
3662                 {
3663                     Quantize(epRGB[j], 5, 3, roundingMode);
3664                     Unquantize(epRGB[j], 5, 3);
3665 
3666                     Quantize(epA + j, 6, 1, roundingMode);
3667                     Unquantize(epA + j, 6, 1);
3668                 }
3669             }
3670 
CompressEndpoints5(MUInt15 epRGB[2][3],MUInt15 epA[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3671             static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3672             {
3673                 for (int j = 0; j < 2; j++)
3674                 {
3675                     Quantize(epRGB[j], 7, 3, roundingMode);
3676                     Unquantize(epRGB[j], 7, 3);
3677                 }
3678 
3679                 // Alpha is full precision
3680                 (void)epA;
3681             }
3682 
CompressEndpoints6(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3683             static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3684             {
3685                 for (int j = 0; j < 2; j++)
3686                     QuantizeP(ep[j], 7, p[j], 4, roundingMode);
3687             }
3688 
CompressEndpoints7(MUInt15 ep[2][4],uint16_t p[2],const ParallelMath::RoundTowardNearestForScope * roundingMode)3689             static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
3690             {
3691                 for (int j = 0; j < 2; j++)
3692                 {
3693                     QuantizeP(ep[j], 5, p[j], 4, roundingMode);
3694                     Unquantize(ep[j], 6, 4);
3695                 }
3696             }
3697 
            // Scratch storage for TrySinglePlane, which declares one instance as a local.
            struct SinglePlaneTemporaries
            {
                // Initial endpoint estimates, filled per evaluated shape by TrySinglePlane
                // (indexed by the shape-list iteration index, not the shape ID).
                UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
                UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];

                // NOTE(review): presumably the best indexes / endpoints / error found so far
                // per fragment and per shape - confirm against the rest of TrySinglePlane.
                MUInt15 fragmentBestIndexes[BC7Data::g_numFragments];
                MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4];
                MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode];
            };
3707 
TrySingleColorRGBAMultiTable(uint32_t flags,const MUInt15 pixels[16][4],const MFloat average[4],int numRealChannels,const uint8_t * fragmentStart,int shapeLength,const MFloat & staticAlphaError,const ParallelMath::Int16CompFlag punchThroughInvalid[4],MFloat & shapeBestError,MUInt15 shapeBestEP[2][4],MUInt15 * fragmentBestIndexes,const float * channelWeightsSq,const cvtt::Tables::BC7SC::Table * const * tables,int numTables,const ParallelMath::RoundTowardNearestForScope * rtn)3708             static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
3709             {
3710                 MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
3711 
3712                 MUInt15 intAverage[4];
3713                 for (int ch = 0; ch < 4; ch++)
3714                     intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
3715 
3716                 MUInt15 eps[2][4];
3717                 MUInt15 reconstructed[4];
3718                 MUInt15 index = ParallelMath::MakeUInt15(0);
3719 
3720                 for (int epi = 0; epi < 2; epi++)
3721                 {
3722                     for (int ch = 0; ch < 3; ch++)
3723                         eps[epi][ch] = ParallelMath::MakeUInt15(0);
3724                     eps[epi][3] = ParallelMath::MakeUInt15(255);
3725                 }
3726 
3727                 for (int ch = 0; ch < 3; ch++)
3728                     reconstructed[ch] = ParallelMath::MakeUInt15(0);
3729                 reconstructed[3] = ParallelMath::MakeUInt15(255);
3730 
3731                 // Depending on the target index and parity bits, there are multiple valid solid colors.
3732                 // We want to find the one closest to the actual average.
3733                 MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
3734                 for (int t = 0; t < numTables; t++)
3735                 {
3736                     const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
3737 
3738                     ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
3739 
3740                     MUInt15 candidateReconstructed[4];
3741                     MUInt15 candidateEPs[2][4];
3742 
3743                     for (int i = 0; i < ParallelMath::ParallelSize; i++)
3744                     {
3745                         for (int ch = 0; ch < numRealChannels; ch++)
3746                         {
3747                             ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
3748                             assert(avgValue >= 0 && avgValue <= 255);
3749 
3750                             const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
3751 
3752                             ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
3753                             ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
3754                             ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
3755                         }
3756                     }
3757 
3758                     MFloat avgError = ParallelMath::MakeFloatZero();
3759                     for (int ch = 0; ch < numRealChannels; ch++)
3760                     {
3761                         MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
3762                         avgError = avgError + delta * delta * channelWeightsSq[ch];
3763                     }
3764 
3765                     ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
3766                     better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
3767 
3768                     if (ParallelMath::AnySet(better))
3769                     {
3770                         ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
3771 
3772                         MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
3773 
3774                         ParallelMath::ConditionalSet(index, better, candidateIndex);
3775 
3776                         for (int ch = 0; ch < numRealChannels; ch++)
3777                             ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
3778 
3779                         for (int epi = 0; epi < 2; epi++)
3780                             for (int ch = 0; ch < numRealChannels; ch++)
3781                                 ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
3782                     }
3783                 }
3784 
3785                 AggregatedError<4> aggError;
3786                 for (int pxi = 0; pxi < shapeLength; pxi++)
3787                 {
3788                     int px = fragmentStart[pxi];
3789 
3790                     BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
3791                 }
3792 
3793                 MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
3794 
3795                 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
3796                 if (ParallelMath::AnySet(better))
3797                 {
3798                     shapeBestError = ParallelMath::Min(shapeBestError, error);
3799                     for (int epi = 0; epi < 2; epi++)
3800                     {
3801                         for (int ch = 0; ch < numRealChannels; ch++)
3802                             ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
3803                     }
3804 
3805                     for (int pxi = 0; pxi < shapeLength; pxi++)
3806                         ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
3807                 }
3808             }
3809 
3810 
TrySinglePlane(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const float channelWeights[4],int numTweakRounds,int numRefineRounds,WorkInfo & work,const ParallelMath::RoundTowardNearestForScope * rtn)3811             static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
3812             {
3813                 if (numRefineRounds < 1)
3814                     numRefineRounds = 1;
3815 
3816                 if (numTweakRounds < 1)
3817                     numTweakRounds = 1;
3818                 else if (numTweakRounds > MaxTweakRounds)
3819                     numTweakRounds = MaxTweakRounds;
3820 
3821                 float channelWeightsSq[4];
3822 
3823                 for (int ch = 0; ch < 4; ch++)
3824                     channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
3825 
3826                 SinglePlaneTemporaries temps;
3827 
3828                 MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
3829                 MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
3830                 ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
3831                 for (int px = 0; px < 16; px++)
3832                 {
3833                     MUInt15 a = pixels[px][3];
3834                     maxAlpha = ParallelMath::Max(maxAlpha, a);
3835                     minAlpha = ParallelMath::Min(minAlpha, a);
3836 
3837                     isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
3838                 }
3839 
3840                 ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
3841                 ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
3842 
3843                 bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
3844 
3845                 // Try RGB modes if any block has a min alpha 251 or higher
3846                 bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
3847 
3848                 // Try mode 7 if any block has alpha.
3849                 // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
3850                 // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
3851                 // situations, and only by at most 1 unit of error per pixel.
3852                 bool allowMode7 = anyBlockHasAlpha;
3853 
3854                 MFloat preWeightedPixels[16][4];
3855 
3856                 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
3857 
3858                 const int *rgbInitialEPCollapseList = NULL;
3859 
3860                 // Get initial RGB endpoints
3861                 if (allowRGBModes)
3862                 {
3863                     const int *shapeList;
3864                     int numShapesToEvaluate;
3865 
3866                     if (flags & Flags::BC7_EnablePartitioning)
3867                     {
3868                         if (flags & Flags::BC7_Enable3Subsets)
3869                         {
3870                             shapeList = BC7Data::g_shapeListAll;
3871                             rgbInitialEPCollapseList = BC7Data::g_shapeListAll;
3872                             numShapesToEvaluate = BC7Data::g_numShapesAll;
3873                         }
3874                         else
3875                         {
3876                             shapeList = BC7Data::g_shapeList12;
3877                             rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
3878                             numShapesToEvaluate = BC7Data::g_numShapes12;
3879                         }
3880                     }
3881                     else
3882                     {
3883                         shapeList = BC7Data::g_shapeList1;
3884                         rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse;
3885                         numShapesToEvaluate = BC7Data::g_numShapes1;
3886                     }
3887 
3888                     for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
3889                     {
3890                         int shape = shapeList[shapeIter];
3891 
3892                         int shapeStart = BC7Data::g_shapeRanges[shape][0];
3893                         int shapeSize = BC7Data::g_shapeRanges[shape][1];
3894 
3895                         EndpointSelector<3, 8> epSelector;
3896 
3897                         for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
3898                         {
3899                             for (int spx = 0; spx < shapeSize; spx++)
3900                             {
3901                                 int px = BC7Data::g_fragments[shapeStart + spx];
3902                                 epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
3903                             }
3904                             epSelector.FinishPass(epPass);
3905                         }
3906                         temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights);
3907                     }
3908                 }
3909 
3910                 const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
3911 
3912                 // Get initial RGBA endpoints
3913                 {
3914                     const int *shapeList = BC7Data::g_shapeList12;
3915                     int numShapesToEvaluate = BC7Data::g_numShapes12;
3916 
3917                     for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
3918                     {
3919                         int shape = shapeList[shapeIter];
3920 
3921                         if (anyBlockHasAlpha || !allowRGBModes)
3922                         {
3923                             int shapeStart = BC7Data::g_shapeRanges[shape][0];
3924                             int shapeSize = BC7Data::g_shapeRanges[shape][1];
3925 
3926                             EndpointSelector<4, 8> epSelector;
3927 
3928                             for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
3929                             {
3930                                 for (int spx = 0; spx < shapeSize; spx++)
3931                                 {
3932                                     int px = BC7Data::g_fragments[shapeStart + spx];
3933                                     epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
3934                                 }
3935                                 epSelector.FinishPass(epPass);
3936                             }
3937                             temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights);
3938                         }
3939                         else
3940                         {
3941                             temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255);
3942                         }
3943                     }
3944                 }
3945 
3946                 for (uint16_t mode = 0; mode <= 7; mode++)
3947                 {
3948                     if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1)
3949                         continue;
3950 
3951                     if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3)
3952                         continue;
3953 
3954                     if (mode == 4 || mode == 5)
3955                         continue;
3956 
3957                     if (mode < 4 && !allowRGBModes)
3958                         continue;
3959 
3960                     if (mode == 7 && !allowMode7)
3961                         continue;
3962 
3963                     bool isRGB = (mode < 4);
3964 
3965                     unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
3966                     int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
3967                     int indexPrec = BC7Data::g_modes[mode].m_indexBits;
3968 
3969                     int parityBitMax = 1;
3970                     if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
3971                         parityBitMax = 4;
3972                     else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
3973                         parityBitMax = 2;
3974 
3975                     int numRealChannels = isRGB ? 3 : 4;
3976 
3977                     int numShapes;
3978                     const int *shapeList;
3979                     const int *shapeCollapseList;
3980 
3981                     if (numSubsets == 1)
3982                     {
3983                         numShapes = BC7Data::g_numShapes1;
3984                         shapeList = BC7Data::g_shapeList1;
3985                         shapeCollapseList = BC7Data::g_shapeList1Collapse;
3986                     }
3987                     else if (numSubsets == 2)
3988                     {
3989                         numShapes = BC7Data::g_numShapes2;
3990                         shapeList = BC7Data::g_shapeList2;
3991                         shapeCollapseList = BC7Data::g_shapeList2Collapse;
3992                     }
3993                     else
3994                     {
3995                         assert(numSubsets == 3);
3996                         if (numPartitions == 16)
3997                         {
3998                             numShapes = BC7Data::g_numShapes3Short;
3999                             shapeList = BC7Data::g_shapeList3Short;
4000                             shapeCollapseList = BC7Data::g_shapeList3ShortCollapse;
4001                         }
4002                         else
4003                         {
4004                             assert(numPartitions == 64);
4005                             numShapes = BC7Data::g_numShapes3;
4006                             shapeList = BC7Data::g_shapeList3;
4007                             shapeCollapseList = BC7Data::g_shapeList3Collapse;
4008                         }
4009                     }
4010 
4011                     for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++)
4012                         temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
4013 
4014                     for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
4015                     {
4016                         int shape = shapeList[shapeIter];
4017                         int shapeStart = BC7Data::g_shapeRanges[shape][0];
4018                         int shapeLength = BC7Data::g_shapeRanges[shape][1];
4019                         int shapeCollapsedEvalIndex = shapeCollapseList[shape];
4020 
4021                         AggregatedError<1> alphaAggError;
4022                         if (isRGB && anyBlockHasAlpha)
4023                         {
4024                             MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
4025 
4026                             for (int pxi = 0; pxi < shapeLength; pxi++)
4027                             {
4028                                 int px = BC7Data::g_fragments[shapeStart + pxi];
4029                                 MUInt15 original[1] = { pixels[px][3] };
4030                                 BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
4031                             }
4032                         }
4033 
4034                         float alphaWeightsSq[1] = { channelWeightsSq[3] };
4035                         MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
4036 
4037                         assert(shapeCollapsedEvalIndex >= 0);
4038 
4039                         MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
4040 
4041                         for (int tweak = 0; tweak < numTweakRounds; tweak++)
4042                         {
4043                             if (isRGB)
4044                             {
4045                                 temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
4046                                 tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
4047                             }
4048                             else
4049                             {
4050                                 temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
4051                             }
4052                         }
4053 
4054                         ParallelMath::Int16CompFlag punchThroughInvalid[4];
4055                         for (int pIter = 0; pIter < parityBitMax; pIter++)
4056                         {
4057                             punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
4058 
4059                             if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
4060                             {
4061                                 // Modes 6 and 7 have parity bits that affect alpha
4062                                 if (pIter == 0)
4063                                     punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
4064                                 else if (pIter == parityBitMax - 1)
4065                                     punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
4066                                 else
4067                                     punchThroughInvalid[pIter] = isPunchThrough;
4068                             }
4069                         }
4070 
4071                         for (int pIter = 0; pIter < parityBitMax; pIter++)
4072                         {
4073                             if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
4074                                 continue;
4075 
4076                             bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
4077 
4078                             for (int tweak = 0; tweak < numTweakRounds; tweak++)
4079                             {
4080                                 uint16_t p[2];
4081                                 p[0] = (pIter & 1);
4082                                 p[1] = ((pIter >> 1) & 1);
4083 
4084                                 MUInt15 ep[2][4];
4085 
4086                                 for (int epi = 0; epi < 2; epi++)
4087                                     for (int ch = 0; ch < 4; ch++)
4088                                         ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
4089 
4090                                 for (int refine = 0; refine < numRefineRounds; refine++)
4091                                 {
4092                                     switch (mode)
4093                                     {
4094                                     case 0:
4095                                         CompressEndpoints0(ep, p, rtn);
4096                                         break;
4097                                     case 1:
4098                                         CompressEndpoints1(ep, p[0], rtn);
4099                                         break;
4100                                     case 2:
4101                                         CompressEndpoints2(ep, rtn);
4102                                         break;
4103                                     case 3:
4104                                         CompressEndpoints3(ep, p, rtn);
4105                                         break;
4106                                     case 6:
4107                                         CompressEndpoints6(ep, p, rtn);
4108                                         break;
4109                                     case 7:
4110                                         CompressEndpoints7(ep, p, rtn);
4111                                         break;
4112                                     default:
4113                                         assert(false);
4114                                         break;
4115                                     };
4116 
4117                                     MFloat shapeError = ParallelMath::MakeFloatZero();
4118 
4119                                     IndexSelector<4> indexSelector;
4120                                     indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
4121 
4122                                     EndpointRefiner<4> epRefiner;
4123                                     epRefiner.Init(1 << indexPrec, channelWeights);
4124 
4125                                     MUInt15 indexes[16];
4126 
4127                                     AggregatedError<4> aggError;
4128                                     for (int pxi = 0; pxi < shapeLength; pxi++)
4129                                     {
4130                                         int px = BC7Data::g_fragments[shapeStart + pxi];
4131 
4132                                         MUInt15 index;
4133                                         MUInt15 reconstructed[4];
4134 
4135                                         index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
4136                                         indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
4137 
4138                                         if (flags & cvtt::Flags::BC7_FastIndexing)
4139                                             BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
4140                                         else
4141                                         {
4142                                             MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
4143 
4144                                             MUInt15 altIndexes[2];
4145                                             altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
4146                                             altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
4147 
4148                                             for (int ii = 0; ii < 2; ii++)
4149                                             {
4150                                                 indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
4151 
4152                                                 MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
4153                                                 ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
4154                                                 error = ParallelMath::Min(error, altError);
4155                                                 ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
4156                                             }
4157 
4158                                             shapeError = shapeError + error;
4159                                         }
4160 
4161                                         if (refine != numRefineRounds - 1)
4162                                             epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
4163 
4164                                         indexes[pxi] = index;
4165                                     }
4166 
4167                                     if (flags & cvtt::Flags::BC7_FastIndexing)
4168                                         shapeError = aggError.Finalize(flags, channelWeightsSq);
4169 
4170                                     if (isRGB)
4171                                         shapeError = shapeError + staticAlphaError;
4172 
4173                                     ParallelMath::FloatCompFlag shapeErrorBetter;
4174                                     ParallelMath::Int16CompFlag shapeErrorBetter16;
4175 
4176                                     shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]);
4177                                     shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
4178 
4179                                     if (ParallelMath::AnySet(shapeErrorBetter16))
4180                                     {
4181                                         bool punchThroughOK = true;
4182                                         if (needPunchThroughCheck)
4183                                         {
4184                                             shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
4185                                             shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
4186 
4187                                             if (!ParallelMath::AnySet(shapeErrorBetter16))
4188                                                 punchThroughOK = false;
4189                                         }
4190 
4191                                         if (punchThroughOK)
4192                                         {
4193                                             ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError);
4194                                             for (int epi = 0; epi < 2; epi++)
4195                                                 for (int ch = 0; ch < numRealChannels; ch++)
4196                                                     ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]);
4197 
4198                                             for (int pxi = 0; pxi < shapeLength; pxi++)
4199                                                 ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
4200                                         }
4201                                     }
4202 
4203                                     if (refine != numRefineRounds - 1)
4204                                         epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
4205                                 } // refine
4206                             } // tweak
4207                         } // p
4208 
4209                         if (flags & cvtt::Flags::BC7_TrySingleColor)
4210                         {
4211                             MUInt15 total[4];
4212                             for (int ch = 0; ch < 4; ch++)
4213                                 total[ch] = ParallelMath::MakeUInt15(0);
4214 
4215                             for (int pxi = 0; pxi < shapeLength; pxi++)
4216                             {
4217                                 int px = BC7Data::g_fragments[shapeStart + pxi];
4218                                 for (int ch = 0; ch < 4; ch++)
4219                                     total[ch] = total[ch] + pixels[pxi][ch];
4220                             }
4221 
4222                             MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
4223                             MFloat average[4];
4224                             for (int ch = 0; ch < 4; ch++)
4225                                 average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
4226 
4227                             const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
4228                             MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex];
4229                             MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex];
4230                             MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
4231 
4232                             const cvtt::Tables::BC7SC::Table **scTables = NULL;
4233                             int numSCTables = 0;
4234 
4235                             switch (mode)
4236                             {
4237                             case 0:
4238                                 {
4239                                     const cvtt::Tables::BC7SC::Table *tables[] =
4240                                     {
4241                                         &cvtt::Tables::BC7SC::g_mode0_p00_i1,
4242                                         &cvtt::Tables::BC7SC::g_mode0_p00_i2,
4243                                         &cvtt::Tables::BC7SC::g_mode0_p00_i3,
4244                                         &cvtt::Tables::BC7SC::g_mode0_p01_i1,
4245                                         &cvtt::Tables::BC7SC::g_mode0_p01_i2,
4246                                         &cvtt::Tables::BC7SC::g_mode0_p01_i3,
4247                                         &cvtt::Tables::BC7SC::g_mode0_p10_i1,
4248                                         &cvtt::Tables::BC7SC::g_mode0_p10_i2,
4249                                         &cvtt::Tables::BC7SC::g_mode0_p10_i3,
4250                                         &cvtt::Tables::BC7SC::g_mode0_p11_i1,
4251                                         &cvtt::Tables::BC7SC::g_mode0_p11_i2,
4252                                         &cvtt::Tables::BC7SC::g_mode0_p11_i3,
4253                                     };
4254                                     scTables = tables;
4255                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4256                                 }
4257                                 break;
4258                             case 1:
4259                                 {
4260                                     const cvtt::Tables::BC7SC::Table *tables[] =
4261                                     {
4262                                         &cvtt::Tables::BC7SC::g_mode1_p0_i1,
4263                                         &cvtt::Tables::BC7SC::g_mode1_p0_i2,
4264                                         &cvtt::Tables::BC7SC::g_mode1_p0_i3,
4265                                         &cvtt::Tables::BC7SC::g_mode1_p1_i1,
4266                                         &cvtt::Tables::BC7SC::g_mode1_p1_i2,
4267                                         &cvtt::Tables::BC7SC::g_mode1_p1_i3,
4268                                     };
4269                                     scTables = tables;
4270                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4271                                 }
4272                                 break;
4273                             case 2:
4274                                 {
4275                                     const cvtt::Tables::BC7SC::Table *tables[] =
4276                                     {
4277                                         &cvtt::Tables::BC7SC::g_mode2,
4278                                     };
4279                                     scTables = tables;
4280                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4281                                 }
4282                                 break;
4283                             case 3:
4284                                 {
4285                                     const cvtt::Tables::BC7SC::Table *tables[] =
4286                                     {
4287                                         &cvtt::Tables::BC7SC::g_mode3_p0,
4288                                         &cvtt::Tables::BC7SC::g_mode3_p1,
4289                                     };
4290                                     scTables = tables;
4291                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4292                                 }
4293                                 break;
4294                             case 6:
4295                                 {
4296                                     const cvtt::Tables::BC7SC::Table *tables[] =
4297                                     {
4298                                         &cvtt::Tables::BC7SC::g_mode6_p0_i1,
4299                                         &cvtt::Tables::BC7SC::g_mode6_p0_i2,
4300                                         &cvtt::Tables::BC7SC::g_mode6_p0_i3,
4301                                         &cvtt::Tables::BC7SC::g_mode6_p0_i4,
4302                                         &cvtt::Tables::BC7SC::g_mode6_p0_i5,
4303                                         &cvtt::Tables::BC7SC::g_mode6_p0_i6,
4304                                         &cvtt::Tables::BC7SC::g_mode6_p0_i7,
4305                                         &cvtt::Tables::BC7SC::g_mode6_p1_i1,
4306                                         &cvtt::Tables::BC7SC::g_mode6_p1_i2,
4307                                         &cvtt::Tables::BC7SC::g_mode6_p1_i3,
4308                                         &cvtt::Tables::BC7SC::g_mode6_p1_i4,
4309                                         &cvtt::Tables::BC7SC::g_mode6_p1_i5,
4310                                         &cvtt::Tables::BC7SC::g_mode6_p1_i6,
4311                                         &cvtt::Tables::BC7SC::g_mode6_p1_i7,
4312                                     };
4313                                     scTables = tables;
4314                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4315                                 }
4316                                 break;
4317                             case 7:
4318                                 {
4319                                     const cvtt::Tables::BC7SC::Table *tables[] =
4320                                     {
4321                                         &cvtt::Tables::BC7SC::g_mode7_p00,
4322                                         &cvtt::Tables::BC7SC::g_mode7_p01,
4323                                         &cvtt::Tables::BC7SC::g_mode7_p10,
4324                                         &cvtt::Tables::BC7SC::g_mode7_p11,
4325                                     };
4326                                     scTables = tables;
4327                                     numSCTables = sizeof(tables) / sizeof(tables[0]);
4328                                 }
4329                                 break;
4330                             default:
4331                                 assert(false);
4332                                 break;
4333                             }
4334 
4335                             TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
4336                         }
4337                     } // shapeIter
4338 
4339                     for (uint16_t partition = 0; partition < numPartitions; partition++)
4340                     {
4341                         const int *partitionShapes;
4342                         if (numSubsets == 1)
4343                             partitionShapes = BC7Data::g_shapes1[partition];
4344                         else if (numSubsets == 2)
4345                             partitionShapes = BC7Data::g_shapes2[partition];
4346                         else
4347                         {
4348                             assert(numSubsets == 3);
4349                             partitionShapes = BC7Data::g_shapes3[partition];
4350                         }
4351 
4352                         MFloat totalError = ParallelMath::MakeFloatZero();
4353                         for (int subset = 0; subset < numSubsets; subset++)
4354                             totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]];
4355 
4356                         ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
4357                         ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
4358 
4359                         if (ParallelMath::AnySet(errorBetter16))
4360                         {
4361                             for (int subset = 0; subset < numSubsets; subset++)
4362                             {
4363                                 int shape = partitionShapes[subset];
4364                                 int shapeStart = BC7Data::g_shapeRanges[shape][0];
4365                                 int shapeLength = BC7Data::g_shapeRanges[shape][1];
4366                                 int shapeCollapsedEvalIndex = shapeCollapseList[shape];
4367 
4368                                 for (int epi = 0; epi < 2; epi++)
4369                                     for (int ch = 0; ch < 4; ch++)
4370                                         ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]);
4371 
4372                                 for (int pxi = 0; pxi < shapeLength; pxi++)
4373                                 {
4374                                     int px = BC7Data::g_fragments[shapeStart + pxi];
4375                                     ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
4376                                 }
4377                             }
4378 
4379                             work.m_error = ParallelMath::Min(totalError, work.m_error);
4380                             ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
4381                             ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
4382                         }
4383                     }
4384                 }
4385             }
4386 
4387             static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
4388             {
4389                 // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
4390                 // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
4391                 // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
4392                 // - Separate alpha channel, then weighted RGB
4393                 // - Alpha+2 other channels, then the independent channel
4394 
4395                 if (!(flags & Flags::BC7_EnableDualPlane))
4396                     return;
4397 
4398                 if (numRefineRounds < 1)
4399                     numRefineRounds = 1;
4400 
4401                 if (numTweakRounds < 1)
4402                     numTweakRounds = 1;
4403                 else if (numTweakRounds > MaxTweakRounds)
4404                     numTweakRounds = MaxTweakRounds;
4405 
4406                 float channelWeightsSq[4];
4407                 for (int ch = 0; ch < 4; ch++)
4408                     channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
4409 
4410                 for (uint16_t mode = 4; mode <= 5; mode++)
4411                 {
4412                     for (uint16_t rotation = 0; rotation < 4; rotation++)
4413                     {
4414                         int alphaChannel = (rotation + 3) & 3;
4415                         int redChannel = (rotation == 1) ? 3 : 0;
4416                         int greenChannel = (rotation == 2) ? 3 : 1;
4417                         int blueChannel = (rotation == 3) ? 3 : 2;
4418 
4419                         MUInt15 rotatedRGB[16][3];
4420                         MFloat floatRotatedRGB[16][3];
4421 
4422                         for (int px = 0; px < 16; px++)
4423                         {
4424                             rotatedRGB[px][0] = pixels[px][redChannel];
4425                             rotatedRGB[px][1] = pixels[px][greenChannel];
4426                             rotatedRGB[px][2] = pixels[px][blueChannel];
4427 
4428                             for (int ch = 0; ch < 3; ch++)
4429                                 floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
4430                         }
4431 
4432                         uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
4433 
4434                         float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
4435                         float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
4436                         float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
4437                         float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
4438 
4439                         float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
4440 
4441                         MFloat preWeightedRotatedRGB[16][3];
4442                         BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
4443 
4444                         for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
4445                         {
4446                             EndpointSelector<3, 8> rgbSelector;
4447 
4448                             for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
4449                             {
4450                                 for (int px = 0; px < 16; px++)
4451                                     rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
4452 
4453                                 rgbSelector.FinishPass(epPass);
4454                             }
4455 
4456                             MUInt15 alphaRange[2];
4457 
4458                             alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
4459                             for (int px = 1; px < 16; px++)
4460                             {
4461                                 alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
4462                                 alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
4463                             }
4464 
4465                             int rgbPrec = 0;
4466                             int alphaPrec = 0;
4467 
4468                             if (mode == 4)
4469                             {
4470                                 rgbPrec = indexSelector ? 3 : 2;
4471                                 alphaPrec = indexSelector ? 2 : 3;
4472                             }
4473                             else
4474                                 rgbPrec = alphaPrec = 2;
4475 
4476                             UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
4477 
4478                             MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
4479                             MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
4480 
4481                             MUInt15 bestRGBIndexes[16];
4482                             MUInt15 bestAlphaIndexes[16];
4483                             MUInt15 bestEP[2][4];
4484 
4485                             for (int px = 0; px < 16; px++)
4486                                 bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
4487 
4488                             for (int tweak = 0; tweak < numTweakRounds; tweak++)
4489                             {
4490                                 MUInt15 rgbEP[2][3];
4491                                 MUInt15 alphaEP[2];
4492 
4493                                 unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
4494 
4495                                 TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
4496 
4497                                 for (int refine = 0; refine < numRefineRounds; refine++)
4498                                 {
4499                                     if (mode == 4)
4500                                         CompressEndpoints4(rgbEP, alphaEP, rtn);
4501                                     else
4502                                         CompressEndpoints5(rgbEP, alphaEP, rtn);
4503 
4504 
4505                                     IndexSelector<1> alphaIndexSelector;
4506                                     IndexSelector<3> rgbIndexSelector;
4507 
4508                                     {
4509                                         MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
4510                                         alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
4511                                     }
4512                                     rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
4513 
4514                                     EndpointRefiner<3> rgbRefiner;
4515                                     EndpointRefiner<1> alphaRefiner;
4516 
4517                                     rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
4518                                     alphaRefiner.Init(1 << alphaPrec, uniformWeight);
4519 
4520                                     MFloat errorRGB = ParallelMath::MakeFloatZero();
4521                                     MFloat errorA = ParallelMath::MakeFloatZero();
4522 
4523                                     MUInt15 rgbIndexes[16];
4524                                     MUInt15 alphaIndexes[16];
4525 
4526                                     AggregatedError<3> rgbAggError;
4527                                     AggregatedError<1> alphaAggError;
4528 
4529                                     for (int px = 0; px < 16; px++)
4530                                     {
4531                                         MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
4532                                         MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
4533 
4534                                         MUInt15 reconstructedRGB[3];
4535                                         MUInt15 reconstructedAlpha[1];
4536 
4537                                         rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
4538                                         alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
4539 
4540                                         if (flags & cvtt::Flags::BC7_FastIndexing)
4541                                         {
4542                                             BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
4543                                             BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
4544                                         }
4545                                         else
4546                                         {
4547                                             AggregatedError<3> baseRGBAggError;
4548                                             AggregatedError<1> baseAlphaAggError;
4549 
4550                                             BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
4551                                             BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
4552 
4553                                             MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
4554                                             MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
4555 
4556                                             MUInt15 altRGBIndexes[2];
4557                                             MUInt15 altAlphaIndexes[2];
4558 
4559                                             altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
4560                                             altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
4561 
4562                                             altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
4563                                             altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
4564 
4565                                             for (int ii = 0; ii < 2; ii++)
4566                                             {
4567                                                 rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
4568                                                 alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
4569 
4570                                                 AggregatedError<3> altRGBAggError;
4571                                                 AggregatedError<1> altAlphaAggError;
4572 
4573                                                 BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
4574                                                 BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
4575 
4576                                                 MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
4577                                                 MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
4578 
4579                                                 ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
4580                                                 ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
4581 
4582                                                 rgbError = ParallelMath::Min(altRGBError, rgbError);
4583                                                 alphaError = ParallelMath::Min(altAlphaError, alphaError);
4584 
4585                                                 ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
4586                                                 ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
4587                                             }
4588 
4589                                             errorRGB = errorRGB + rgbError;
4590                                             errorA = errorA + alphaError;
4591                                         }
4592 
4593                                         if (refine != numRefineRounds - 1)
4594                                         {
4595                                             rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
4596                                             alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
4597                                         }
4598 
4599                                         if (flags & Flags::BC7_FastIndexing)
4600                                         {
4601                                             errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
4602                                             errorA = rgbAggError.Finalize(flags, rotatedAlphaWeightSq);
4603                                         }
4604 
4605                                         rgbIndexes[px] = rgbIndex;
4606                                         alphaIndexes[px] = alphaIndex;
4607                                     }
4608 
4609                                     ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
4610                                     ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
4611 
4612                                     ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
4613                                     ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
4614 
4615                                     if (ParallelMath::AnySet(rgbBetterInt16))
4616                                     {
4617                                         bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
4618 
4619                                         for (int px = 0; px < 16; px++)
4620                                             ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
4621 
4622                                         for (int ep = 0; ep < 2; ep++)
4623                                         {
4624                                             for (int ch = 0; ch < 3; ch++)
4625                                                 ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
4626                                         }
4627                                     }
4628 
4629                                     if (ParallelMath::AnySet(alphaBetterInt16))
4630                                     {
4631                                         bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
4632 
4633                                         for (int px = 0; px < 16; px++)
4634                                             ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
4635 
4636                                         for (int ep = 0; ep < 2; ep++)
4637                                             ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
4638                                     }
4639 
4640                                     if (refine != numRefineRounds - 1)
4641                                     {
4642                                         rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
4643 
4644                                         MUInt15 alphaEPTemp[2][1];
4645                                         alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
4646 
4647                                         for (int i = 0; i < 2; i++)
4648                                             alphaEP[i] = alphaEPTemp[i][0];
4649                                     }
4650                                 }	// refine
4651                             } // tweak
4652 
4653                             MFloat combinedError = bestRGBError + bestAlphaError;
4654 
4655                             ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
4656                             ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
4657 
4658                             work.m_error = ParallelMath::Min(combinedError, work.m_error);
4659 
4660                             ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
4661                             ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
4662                             ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
4663 
4664                             for (int px = 0; px < 16; px++)
4665                             {
4666                                 ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
4667                                 ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
4668                             }
4669 
4670                             for (int ep = 0; ep < 2; ep++)
4671                                 for (int ch = 0; ch < 4; ch++)
4672                                     ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
4673                         }
4674                     }
4675                 }
4676             }
4677 
// Exchanges the values held by a and b.
// T only needs to be copy-constructible and copy-assignable.
template<class T>
static void Swap(T& a, T& b)
{
    const T held(b);   // keep b's value while b is overwritten
    b = a;
    a = held;
}
4685 
Pack(uint32_t flags,const PixelBlockU8 * inputs,uint8_t * packedBlocks,const float channelWeights[4],int numTweakRounds,int numRefineRounds)4686             static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds)
4687             {
4688                 MUInt15 pixels[16][4];
4689                 MFloat floatPixels[16][4];
4690 
4691                 for (int px = 0; px < 16; px++)
4692                 {
4693                     for (int ch = 0; ch < 4; ch++)
4694                         ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
4695                 }
4696 
4697                 for (int px = 0; px < 16; px++)
4698                 {
4699                     for (int ch = 0; ch < 4; ch++)
4700                         floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
4701                 }
4702 
4703                 WorkInfo work;
4704                 memset(&work, 0, sizeof(work));
4705 
4706                 work.m_error = ParallelMath::MakeFloat(FLT_MAX);
4707 
4708                 {
4709                     ParallelMath::RoundTowardNearestForScope rtn;
4710                     TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
4711                     TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
4712                 }
4713 
4714                 for (int block = 0; block < ParallelMath::ParallelSize; block++)
4715                 {
4716                     PackingVector pv;
4717                     pv.Init();
4718 
4719                     ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
4720                     ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
4721                     ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
4722 
4723                     const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
4724 
4725                     ParallelMath::ScalarUInt16 indexes[16];
4726                     ParallelMath::ScalarUInt16 indexes2[16];
4727                     ParallelMath::ScalarUInt16 endPoints[3][2][4];
4728 
4729                     for (int i = 0; i < 16; i++)
4730                     {
4731                         indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
4732                         if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
4733                             indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
4734                     }
4735 
4736                     for (int subset = 0; subset < 3; subset++)
4737                     {
4738                         for (int ep = 0; ep < 2; ep++)
4739                         {
4740                             for (int ch = 0; ch < 4; ch++)
4741                                 endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
4742                         }
4743                     }
4744 
4745                     int fixups[3] = { 0, 0, 0 };
4746 
4747                     if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
4748                     {
4749                         bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
4750                         bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
4751 
4752                         if (flipRGB)
4753                         {
4754                             uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
4755                             for (int px = 0; px < 16; px++)
4756                                 indexes[px] = highIndex - indexes[px];
4757                         }
4758 
4759                         if (flipAlpha)
4760                         {
4761                             uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
4762                             for (int px = 0; px < 16; px++)
4763                                 indexes2[px] = highIndex - indexes2[px];
4764                         }
4765 
4766                         if (indexSelector)
4767                             Swap(flipRGB, flipAlpha);
4768 
4769                         if (flipRGB)
4770                         {
4771                             for (int ch = 0; ch < 3; ch++)
4772                                 Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
4773                         }
4774                         if (flipAlpha)
4775                             Swap(endPoints[0][0][3], endPoints[0][1][3]);
4776 
4777                     }
4778                     else
4779                     {
4780                         if (modeInfo.m_numSubsets == 2)
4781                             fixups[1] = BC7Data::g_fixupIndexes2[partition];
4782                         else if (modeInfo.m_numSubsets == 3)
4783                         {
4784                             fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
4785                             fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
4786                         }
4787 
4788                         bool flip[3] = { false, false, false };
4789                         for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4790                             flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
4791 
4792                         if (flip[0] || flip[1] || flip[2])
4793                         {
4794                             uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
4795                             for (int px = 0; px < 16; px++)
4796                             {
4797                                 int subset = 0;
4798                                 if (modeInfo.m_numSubsets == 2)
4799                                     subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
4800                                 else if (modeInfo.m_numSubsets == 3)
4801                                     subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
4802 
4803                                 if (flip[subset])
4804                                     indexes[px] = highIndex - indexes[px];
4805                             }
4806 
4807                             int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
4808                             for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4809                             {
4810                                 if (flip[subset])
4811                                     for (int ch = 0; ch < maxCH; ch++)
4812                                         Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
4813                             }
4814                         }
4815                     }
4816 
4817                     pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
4818 
4819                     if (modeInfo.m_partitionBits)
4820                         pv.Pack(partition, modeInfo.m_partitionBits);
4821 
4822                     if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
4823                     {
4824                         ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
4825                         pv.Pack(rotation, 2);
4826                     }
4827 
4828                     if (modeInfo.m_hasIndexSelector)
4829                         pv.Pack(indexSelector, 1);
4830 
4831                     // Encode RGB
4832                     for (int ch = 0; ch < 3; ch++)
4833                     {
4834                         for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4835                         {
4836                             for (int ep = 0; ep < 2; ep++)
4837                             {
4838                                 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
4839                                 epPart >>= (8 - modeInfo.m_rgbBits);
4840 
4841                                 pv.Pack(epPart, modeInfo.m_rgbBits);
4842                             }
4843                         }
4844                     }
4845 
4846                     // Encode alpha
4847                     if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
4848                     {
4849                         for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4850                         {
4851                             for (int ep = 0; ep < 2; ep++)
4852                             {
4853                                 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
4854                                 epPart >>= (8 - modeInfo.m_alphaBits);
4855 
4856                                 pv.Pack(epPart, modeInfo.m_alphaBits);
4857                             }
4858                         }
4859                     }
4860 
4861                     // Encode parity bits
4862                     if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
4863                     {
4864                         for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4865                         {
4866                             ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
4867                             epPart >>= (7 - modeInfo.m_rgbBits);
4868                             epPart &= 1;
4869 
4870                             pv.Pack(epPart, 1);
4871                         }
4872                     }
4873                     else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
4874                     {
4875                         for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4876                         {
4877                             for (int ep = 0; ep < 2; ep++)
4878                             {
4879                                 ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
4880                                 epPart >>= (7 - modeInfo.m_rgbBits);
4881                                 epPart &= 1;
4882 
4883                                 pv.Pack(epPart, 1);
4884                             }
4885                         }
4886                     }
4887 
4888                     // Encode indexes
4889                     for (int px = 0; px < 16; px++)
4890                     {
4891                         int bits = modeInfo.m_indexBits;
4892                         if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
4893                             bits--;
4894 
4895                         pv.Pack(indexes[px], bits);
4896                     }
4897 
4898                     // Encode secondary indexes
4899                     if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
4900                     {
4901                         for (int px = 0; px < 16; px++)
4902                         {
4903                             int bits = modeInfo.m_alphaIndexBits;
4904                             if (px == 0)
4905                                 bits--;
4906 
4907                             pv.Pack(indexes2[px], bits);
4908                         }
4909                     }
4910 
4911                     pv.Flush(packedBlocks);
4912 
4913                     packedBlocks += 16;
4914                 }
4915             }
4916 
UnpackOne(PixelBlockU8 & output,const uint8_t * packedBlock)4917             static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
4918             {
4919                 UnpackingVector pv;
4920                 pv.Init(packedBlock);
4921 
4922                 int mode = 8;
4923                 for (int i = 0; i < 8; i++)
4924                 {
4925                     if (pv.Unpack(1) == 1)
4926                     {
4927                         mode = i;
4928                         break;
4929                     }
4930                 }
4931 
4932                 if (mode > 7)
4933                 {
4934                     for (int px = 0; px < 16; px++)
4935                         for (int ch = 0; ch < 4; ch++)
4936                             output.m_pixels[px][ch] = 0;
4937 
4938                     return;
4939                 }
4940 
4941                 const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
4942 
4943                 int partition = 0;
4944                 if (modeInfo.m_partitionBits)
4945                     partition = pv.Unpack(modeInfo.m_partitionBits);
4946 
4947                 int rotation = 0;
4948                 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
4949                     rotation = pv.Unpack(2);
4950 
4951                 int indexSelector = 0;
4952                 if (modeInfo.m_hasIndexSelector)
4953                     indexSelector = pv.Unpack(1);
4954 
4955                 // Resolve fixups
4956                 int fixups[3] = { 0, 0, 0 };
4957 
4958                 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
4959                 {
4960                     if (modeInfo.m_numSubsets == 2)
4961                         fixups[1] = BC7Data::g_fixupIndexes2[partition];
4962                     else if (modeInfo.m_numSubsets == 3)
4963                     {
4964                         fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
4965                         fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
4966                     }
4967                 }
4968 
4969                 int endPoints[3][2][4];
4970 
4971                 // Decode RGB
4972                 for (int ch = 0; ch < 3; ch++)
4973                 {
4974                     for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4975                     {
4976                         for (int ep = 0; ep < 2; ep++)
4977                             endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
4978                     }
4979                 }
4980 
4981                 // Decode alpha
4982                 if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
4983                 {
4984                     for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4985                     {
4986                         for (int ep = 0; ep < 2; ep++)
4987                             endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
4988                     }
4989                 }
4990                 else
4991                 {
4992                     for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
4993                     {
4994                         for (int ep = 0; ep < 2; ep++)
4995                             endPoints[subset][ep][3] = 255;
4996                     }
4997                 }
4998 
4999                 int parityBits = 0;
5000 
5001                 // Decode parity bits
5002                 if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
5003                 {
5004                     for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
5005                     {
5006                         int p = pv.Unpack(1);
5007 
5008                         for (int ep = 0; ep < 2; ep++)
5009                         {
5010                             for (int ch = 0; ch < 3; ch++)
5011                                 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
5012 
5013                             if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
5014                                 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
5015                         }
5016                     }
5017 
5018                     parityBits = 1;
5019                 }
5020                 else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
5021                 {
5022                     for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
5023                     {
5024                         for (int ep = 0; ep < 2; ep++)
5025                         {
5026                             int p = pv.Unpack(1);
5027 
5028                             for (int ch = 0; ch < 3; ch++)
5029                                 endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
5030 
5031                             if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
5032                                 endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
5033                         }
5034                     }
5035 
5036                     parityBits = 1;
5037                 }
5038 
5039                 // Fill endpoint bits
5040                 for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
5041                 {
5042                     for (int ep = 0; ep < 2; ep++)
5043                     {
5044                         for (int ch = 0; ch < 3; ch++)
5045                             endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
5046 
5047                         if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
5048                             endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
5049                     }
5050                 }
5051 
5052                 int indexes[16];
5053                 int indexes2[16];
5054 
5055                 // Decode indexes
5056                 for (int px = 0; px < 16; px++)
5057                 {
5058                     int bits = modeInfo.m_indexBits;
5059                     if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
5060                         bits--;
5061 
5062                     indexes[px] = pv.Unpack(bits);
5063                 }
5064 
5065                 // Decode secondary indexes
5066                 if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
5067                 {
5068                     for (int px = 0; px < 16; px++)
5069                     {
5070                         int bits = modeInfo.m_alphaIndexBits;
5071                         if (px == 0)
5072                             bits--;
5073 
5074                         indexes2[px] = pv.Unpack(bits);
5075                     }
5076                 }
5077                 else
5078                 {
5079                     for (int px = 0; px < 16; px++)
5080                         indexes2[px] = 0;
5081                 }
5082 
5083                 const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
5084                 const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
5085 
5086                 // Decode each pixel
5087                 for (int px = 0; px < 16; px++)
5088                 {
5089                     int rgbWeight = 0;
5090                     int alphaWeight = 0;
5091 
5092                     int rgbIndex = indexes[px];
5093 
5094                     rgbWeight = rgbWeights[indexes[px]];
5095 
5096                     if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
5097                         alphaWeight = rgbWeight;
5098                     else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
5099                         alphaWeight = alphaWeights[indexes2[px]];
5100 
5101                     if (indexSelector == 1)
5102                     {
5103                         int temp = rgbWeight;
5104                         rgbWeight = alphaWeight;
5105                         alphaWeight = temp;
5106                     }
5107 
5108                     int pixel[4] = { 0, 0, 0, 255 };
5109 
5110                     int subset = 0;
5111 
5112                     if (modeInfo.m_numSubsets == 2)
5113                         subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
5114                     else if (modeInfo.m_numSubsets == 3)
5115                         subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
5116 
5117                     for (int ch = 0; ch < 3; ch++)
5118                         pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
5119 
5120                     if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
5121                         pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
5122 
5123                     if (rotation != 0)
5124                     {
5125                         int ch = rotation - 1;
5126                         int temp = pixel[ch];
5127                         pixel[ch] = pixel[3];
5128                         pixel[3] = temp;
5129                     }
5130 
5131                     for (int ch = 0; ch < 4; ch++)
5132                         output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
5133                 }
5134             }
5135         };
5136 
5137         class BC6HComputer
5138         {
5139         public:
            // Shorthand aliases for the ParallelMath types used by the methods of this class.
            typedef ParallelMath::Float MFloat;
            typedef ParallelMath::SInt16 MSInt16;
            typedef ParallelMath::UInt16 MUInt16;
            typedef ParallelMath::UInt15 MUInt15;
            typedef ParallelMath::AInt16 MAInt16;
            typedef ParallelMath::SInt32 MSInt32;
            typedef ParallelMath::UInt31 MUInt31;

            // Upper limits that Pack clamps its numTweakRounds / numRefineRounds arguments to;
            // their product also sizes Pack's per-round scratch arrays.
            static const int MaxTweakRounds = 4;
            static const int MaxRefineRounds = 3;
5150 
QuantizeSingleEndpointElementSigned(const MSInt16 & elem2CL,int precision,const ParallelMath::RoundUpForScope * ru)5151             static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
5152             {
5153                 assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
5154                 assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
5155 
5156                 // Expand to full range
5157                 ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
5158                 MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
5159 
5160                 absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
5161 
5162                 MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
5163 
5164                 return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
5165             }
5166 
QuantizeSingleEndpointElementUnsigned(const MUInt15 & elem,int precision,const ParallelMath::RoundUpForScope * ru)5167             static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
5168             {
5169                 MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
5170                 return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
5171             }
5172 
UnquantizeSingleEndpointElementSigned(const MSInt16 & comp,int precision,MSInt16 & outUnquantized,MSInt16 & outUnquantizedFinished2CL)5173             static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
5174             {
5175                 MSInt16 zero = ParallelMath::MakeSInt16(0);
5176 
5177                 ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
5178                 MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
5179 
5180                 MSInt16 unq;
5181                 MUInt15 absUnq;
5182 
5183                 if (precision >= 16)
5184                 {
5185                     unq = comp;
5186                     absUnq = absComp;
5187                 }
5188                 else
5189                 {
5190                     MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
5191                     ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
5192                     ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
5193 
5194                     absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
5195                     ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
5196                     ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
5197 
5198                     unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
5199                 }
5200 
5201                 outUnquantized = unq;
5202 
5203                 MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
5204 
5205                 outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
5206             }
5207 
UnquantizeSingleEndpointElementUnsigned(const MUInt15 & comp,int precision,MUInt16 & outUnquantized,MUInt16 & outUnquantizedFinished)5208             static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
5209             {
5210                 MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
5211                 if (precision < 15)
5212                 {
5213                     MUInt15 zero = ParallelMath::MakeUInt15(0);
5214                     MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
5215 
5216                     ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
5217                     ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
5218 
5219                     unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
5220 
5221                     ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
5222                     ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
5223                 }
5224 
5225                 outUnquantized = unq;
5226                 outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
5227             }
5228 
QuantizeEndpointsSigned(const MSInt16 endPoints[2][3],const MFloat floatPixelsColorSpace[16][3],const MFloat floatPixelsLinearWeighted[16][3],MAInt16 quantizedEndPoints[2][3],MUInt15 indexes[16],IndexSelectorHDR<3> & indexSelector,int fixupIndex,int precision,int indexRange,const float * channelWeights,bool fastIndexing,const ParallelMath::RoundTowardNearestForScope * rtn)5229             static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
5230             {
5231                 MSInt16 unquantizedEP[2][3];
5232                 MSInt16 finishedUnquantizedEP[2][3];
5233 
5234                 {
5235                     ParallelMath::RoundUpForScope ru;
5236 
5237                     for (int epi = 0; epi < 2; epi++)
5238                     {
5239                         for (int ch = 0; ch < 3; ch++)
5240                         {
5241                             MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
5242                             UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
5243                             quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
5244                         }
5245                     }
5246                 }
5247 
5248                 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
5249                 indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
5250 
5251                 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
5252 
5253                 MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
5254 
5255                 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
5256 
5257                 if (ParallelMath::AnySet(invert))
5258                 {
5259                     ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
5260 
5261                     indexSelector.ConditionalInvert(invert);
5262 
5263                     for (int ch = 0; ch < 3; ch++)
5264                     {
5265                         MAInt16 firstEP = quantizedEndPoints[0][ch];
5266                         MAInt16 secondEP = quantizedEndPoints[1][ch];
5267 
5268                         quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
5269                         quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
5270                     }
5271                 }
5272 
5273                 indexes[fixupIndex] = index;
5274             }
5275 
QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3],const MFloat floatPixelsColorSpace[16][3],const MFloat floatPixelsLinearWeighted[16][3],MAInt16 quantizedEndPoints[2][3],MUInt15 indexes[16],IndexSelectorHDR<3> & indexSelector,int fixupIndex,int precision,int indexRange,const float * channelWeights,bool fastIndexing,const ParallelMath::RoundTowardNearestForScope * rtn)5276             static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
5277             {
5278                 MUInt16 unquantizedEP[2][3];
5279                 MUInt16 finishedUnquantizedEP[2][3];
5280 
5281                 {
5282                     ParallelMath::RoundUpForScope ru;
5283 
5284                     for (int epi = 0; epi < 2; epi++)
5285                     {
5286                         for (int ch = 0; ch < 3; ch++)
5287                         {
5288                             MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
5289                             UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
5290                             quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
5291                         }
5292                     }
5293                 }
5294 
5295                 indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
5296                 indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
5297 
5298                 MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
5299 
5300                 MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
5301 
5302                 ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
5303 
5304                 if (ParallelMath::AnySet(invert))
5305                 {
5306                     ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
5307 
5308                     indexSelector.ConditionalInvert(invert);
5309 
5310                     for (int ch = 0; ch < 3; ch++)
5311                     {
5312                         MAInt16 firstEP = quantizedEndPoints[0][ch];
5313                         MAInt16 secondEP = quantizedEndPoints[1][ch];
5314 
5315                         quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
5316                         quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
5317                     }
5318                 }
5319 
5320                 indexes[fixupIndex] = index;
5321             }
5322 
EvaluatePartitionedLegality(const MAInt16 ep0[2][3],const MAInt16 ep1[2][3],int aPrec,const int bPrec[3],bool isTransformed,MAInt16 outEncodedEPs[2][2][3],ParallelMath::Int16CompFlag & outIsLegal)5323             static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
5324             {
5325                 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
5326 
5327                 MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
5328 
5329                 for (int ch = 0; ch < 3; ch++)
5330                 {
5331                     outEncodedEPs[0][0][ch] = ep0[0][ch];
5332                     outEncodedEPs[0][1][ch] = ep0[1][ch];
5333                     outEncodedEPs[1][0][ch] = ep1[0][ch];
5334                     outEncodedEPs[1][1][ch] = ep1[1][ch];
5335 
5336                     if (isTransformed)
5337                     {
5338                         for (int subset = 0; subset < 2; subset++)
5339                         {
5340                             for (int epi = 0; epi < 2; epi++)
5341                             {
5342                                 if (epi == 0 && subset == 0)
5343                                     continue;
5344 
5345                                 MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
5346 
5347                                 MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
5348 
5349                                 outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
5350 
5351                                 MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
5352                                 allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
5353                             }
5354                         }
5355                     }
5356 
5357                     if (!ParallelMath::AnySet(allLegal))
5358                         break;
5359                 }
5360 
5361                 outIsLegal = allLegal;
5362             }
5363 
EvaluateSingleLegality(const MAInt16 ep[2][3],int aPrec,const int bPrec[3],bool isTransformed,MAInt16 outEncodedEPs[2][3],ParallelMath::Int16CompFlag & outIsLegal)5364             static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
5365             {
5366                 ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
5367 
5368                 MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
5369 
5370                 for (int ch = 0; ch < 3; ch++)
5371                 {
5372                     outEncodedEPs[0][ch] = ep[0][ch];
5373                     outEncodedEPs[1][ch] = ep[1][ch];
5374 
5375                     if (isTransformed)
5376                     {
5377                         MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
5378 
5379                         MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
5380 
5381                         outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
5382 
5383                         MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
5384                         allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
5385                     }
5386                 }
5387 
5388                 outIsLegal = allLegal;
5389             }
5390 
Pack(uint32_t flags,const PixelBlockF16 * inputs,uint8_t * packedBlocks,const float channelWeights[4],bool isSigned,int numTweakRounds,int numRefineRounds)5391             static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
5392             {
5393                 if (numTweakRounds < 1)
5394                     numTweakRounds = 1;
5395                 else if (numTweakRounds > MaxTweakRounds)
5396                     numTweakRounds = MaxTweakRounds;
5397 
5398                 if (numRefineRounds < 1)
5399                     numRefineRounds = 1;
5400                 else if (numRefineRounds > MaxRefineRounds)
5401                     numRefineRounds = MaxRefineRounds;
5402 
5403                 bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
5404                 float channelWeightsSq[3];
5405 
5406                 ParallelMath::RoundTowardNearestForScope rtn;
5407 
5408                 MSInt16 pixels[16][3];
5409                 MFloat floatPixels2CL[16][3];
5410                 MFloat floatPixelsLinearWeighted[16][3];
5411 
5412                 MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
5413 
5414                 for (int ch = 0; ch < 3; ch++)
5415                     channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
5416 
5417                 for (int px = 0; px < 16; px++)
5418                 {
5419                     for (int ch = 0; ch < 3; ch++)
5420                     {
5421                         MSInt16 pixelValue;
5422                         ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
5423 
5424                         // Convert from sign+magnitude to 2CL
5425                         if (isSigned)
5426                         {
5427                             ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
5428                             MSInt16 magnitude = (pixelValue & low15Bits);
5429                             ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
5430                             pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
5431                         }
5432                         else
5433                             pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
5434 
5435                         pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
5436 
5437                         pixels[px][ch] = pixelValue;
5438                         floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
5439                         floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
5440                     }
5441                 }
5442 
5443                 MFloat preWeightedPixels[16][3];
5444 
5445                 BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
5446 
5447                 MAInt16 bestEndPoints[2][2][3];
5448                 MUInt15 bestIndexes[16];
5449                 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
5450                 MUInt15 bestMode = ParallelMath::MakeUInt15(0);
5451                 MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
5452 
5453                 for (int px = 0; px < 16; px++)
5454                     bestIndexes[px] = ParallelMath::MakeUInt15(0);
5455 
5456                 for (int subset = 0; subset < 2; subset++)
5457                     for (int epi = 0; epi < 2; epi++)
5458                         for (int ch = 0; ch < 3; ch++)
5459                             bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
5460 
5461                 UnfinishedEndpoints<3> partitionedUFEP[32][2];
5462                 UnfinishedEndpoints<3> singleUFEP;
5463 
5464                 // Generate UFEP for partitions
5465                 for (int p = 0; p < 32; p++)
5466                 {
5467                     int partitionMask = BC7Data::g_partitionMap[p];
5468 
5469                     EndpointSelector<3, 8> epSelectors[2];
5470 
5471                     for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
5472                     {
5473                         for (int px = 0; px < 16; px++)
5474                         {
5475                             int subset = (partitionMask >> px) & 1;
5476                             epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
5477                         }
5478 
5479                         for (int subset = 0; subset < 2; subset++)
5480                             epSelectors[subset].FinishPass(pass);
5481                     }
5482 
5483                     for (int subset = 0; subset < 2; subset++)
5484                         partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
5485                 }
5486 
5487                 // Generate UFEP for single
5488                 {
5489                     EndpointSelector<3, 8> epSelector;
5490 
5491                     for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
5492                     {
5493                         for (int px = 0; px < 16; px++)
5494                             epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
5495 
5496                         epSelector.FinishPass(pass);
5497                     }
5498 
5499                     singleUFEP = epSelector.GetEndpoints(channelWeights);
5500                 }
5501 
5502                 for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
5503                 {
5504                     bool partitioned = (partitionedInt == 1);
5505 
5506                     for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
5507                     {
5508                         if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
5509                             continue;
5510 
5511                         int numPartitions = partitioned ? 32 : 1;
5512                         int numSubsets = partitioned ? 2 : 1;
5513                         int indexBits = partitioned ? 3 : 4;
5514                         int indexRange = (1 << indexBits);
5515 
5516                         for (int p = 0; p < numPartitions; p++)
5517                         {
5518                             int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
5519 
5520                             const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
5521 
5522                             MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
5523                             MUInt15 metaIndexes[MaxMetaRounds][16];
5524                             MFloat metaError[MaxMetaRounds][2];
5525 
5526                             bool roundValid[MaxMetaRounds][2];
5527 
5528                             for (int r = 0; r < MaxMetaRounds; r++)
5529                                 for (int subset = 0; subset < 2; subset++)
5530                                     roundValid[r][subset] = true;
5531 
5532                             for (int subset = 0; subset < numSubsets; subset++)
5533                             {
5534                                 for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
5535                                 {
5536                                     EndpointRefiner<3> refiners[2];
5537 
5538                                     bool abortRemainingRefines = false;
5539                                     for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
5540                                     {
5541                                         int metaRound = tweak * MaxRefineRounds + refinePass;
5542 
5543                                         if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
5544                                             abortRemainingRefines = true;
5545 
5546                                         if (abortRemainingRefines)
5547                                         {
5548                                             roundValid[metaRound][subset] = false;
5549                                             continue;
5550                                         }
5551 
5552                                         MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
5553                                         MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
5554 
5555                                         MSInt16 endPointsColorSpace[2][3];
5556 
5557                                         if (refinePass == 0)
5558                                         {
5559                                             UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
5560 
5561                                             if (isSigned)
5562                                                 ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
5563                                             else
5564                                                 ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
5565                                         }
5566                                         else
5567                                             refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
5568 
5569                                         refiners[subset].Init(indexRange, channelWeights);
5570 
5571                                         int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
5572 
5573                                         IndexSelectorHDR<3> indexSelector;
5574                                         if (isSigned)
5575                                             QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
5576                                         else
5577                                             QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
5578 
5579                                         if (metaRound > 0)
5580                                         {
5581                                             ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
5582 
5583                                             for (int prevRound = 0; prevRound < metaRound; prevRound++)
5584                                             {
5585                                                 MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
5586 
5587                                                 ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
5588 
5589                                                 for (int epi = 0; epi < 2; epi++)
5590                                                     for (int ch = 0; ch < 3; ch++)
5591                                                         same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
5592 
5593                                                 anySame = (anySame | same);
5594                                                 if (ParallelMath::AllSet(anySame))
5595                                                     break;
5596                                             }
5597 
5598                                             if (ParallelMath::AllSet(anySame))
5599                                             {
5600                                                 roundValid[metaRound][subset] = false;
5601                                                 continue;
5602                                             }
5603                                         }
5604 
5605                                         MFloat subsetError = ParallelMath::MakeFloatZero();
5606 
5607                                         {
5608                                             for (int px = 0; px < 16; px++)
5609                                             {
5610                                                 if (subset != ((partitionMask >> px) & 1))
5611                                                     continue;
5612 
5613                                                 MUInt15 index;
5614                                                 if (px == fixupIndex)
5615                                                     index = mrIndexes[px];
5616                                                 else
5617                                                 {
5618                                                     index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
5619                                                     mrIndexes[px] = index;
5620                                                 }
5621 
5622                                                 MSInt16 reconstructed[3];
5623                                                 if (isSigned)
5624                                                     indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
5625                                                 else
5626                                                     indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
5627 
5628                                                 subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
5629 
5630                                                 if (refinePass != numRefineRounds - 1)
5631                                                     refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
5632                                             }
5633                                         }
5634 
5635                                         metaError[metaRound][subset] = subsetError;
5636                                     }
5637                                 }
5638                             }
5639 
5640                             // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
5641                             int numMeta1 = partitioned ? MaxMetaRounds : 1;
5642                             for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
5643                             {
5644                                 if (!roundValid[meta0][0])
5645                                     continue;
5646 
5647                                 for (int meta1 = 0; meta1 < numMeta1; meta1++)
5648                                 {
5649                                     MFloat combinedError = metaError[meta0][0];
5650                                     if (partitioned)
5651                                     {
5652                                         if (!roundValid[meta1][1])
5653                                             continue;
5654 
5655                                         combinedError = combinedError + metaError[meta1][1];
5656                                     }
5657 
5658                                     ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
5659                                     if (!ParallelMath::AnySet(errorBetter))
5660                                         continue;
5661 
5662                                     ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
5663 
5664                                     // Figure out if this is encodable
5665                                     for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
5666                                     {
5667                                         const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
5668 
5669                                         if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
5670                                             continue;
5671 
5672                                         MAInt16 encodedEPs[2][2][3];
5673                                         ParallelMath::Int16CompFlag isLegal;
5674                                         if (partitioned)
5675                                             EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
5676                                         else
5677                                             EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
5678 
5679                                         ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
5680                                         if (!ParallelMath::AnySet(isLegalAndBetter))
5681                                             continue;
5682 
5683                                         ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
5684 
5685                                         ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
5686                                         ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
5687                                         ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
5688 
5689                                         for (int subset = 0; subset < numSubsets; subset++)
5690                                         {
5691                                             for (int epi = 0; epi < 2; epi++)
5692                                             {
5693                                                 for (int ch = 0; ch < 3; ch++)
5694                                                     ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
5695                                             }
5696                                         }
5697 
5698                                         for (int px = 0; px < 16; px++)
5699                                         {
5700                                             int subset = ((partitionMask >> px) & 1);
5701                                             if (subset == 0)
5702                                                 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
5703                                             else
5704                                                 ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
5705                                         }
5706 
5707                                         needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
5708                                         if (!ParallelMath::AnySet(needsCommit))
5709                                             break;
5710                                     }
5711                                 }
5712                             }
5713                         }
5714                     }
5715                 }
5716 
5717                 // At this point, everything should be set
5718                 for (int block = 0; block < ParallelMath::ParallelSize; block++)
5719                 {
5720                     ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
5721                     ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
5722                     int32_t eps[2][2][3];
5723                     ParallelMath::ScalarUInt16 indexes[16];
5724 
5725                     const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
5726 
5727                     const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
5728 
5729                     const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
5730 
5731                     for (int subset = 0; subset < 2; subset++)
5732                     {
5733                         for (int epi = 0; epi < 2; epi++)
5734                         {
5735                             for (int ch = 0; ch < 3; ch++)
5736                                 eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
5737                         }
5738                     }
5739 
5740                     for (int px = 0; px < 16; px++)
5741                         indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
5742 
5743                     uint16_t modeID = modeInfo.m_modeID;
5744 
5745                     PackingVector pv;
5746                     pv.Init();
5747 
5748                     for (size_t i = 0; i < headerBits; i++)
5749                     {
5750                         int32_t codedValue = 0;
5751                         switch (desc[i].m_eField)
5752                         {
5753                         case BC6HData::M:  codedValue = modeID; break;
5754                         case BC6HData::D:  codedValue = partition; break;
5755                         case BC6HData::RW: codedValue = eps[0][0][0]; break;
5756                         case BC6HData::RX: codedValue = eps[0][1][0]; break;
5757                         case BC6HData::RY: codedValue = eps[1][0][0]; break;
5758                         case BC6HData::RZ: codedValue = eps[1][1][0]; break;
5759                         case BC6HData::GW: codedValue = eps[0][0][1]; break;
5760                         case BC6HData::GX: codedValue = eps[0][1][1]; break;
5761                         case BC6HData::GY: codedValue = eps[1][0][1]; break;
5762                         case BC6HData::GZ: codedValue = eps[1][1][1]; break;
5763                         case BC6HData::BW: codedValue = eps[0][0][2]; break;
5764                         case BC6HData::BX: codedValue = eps[0][1][2]; break;
5765                         case BC6HData::BY: codedValue = eps[1][0][2]; break;
5766                         case BC6HData::BZ: codedValue = eps[1][1][2]; break;
5767                         default: assert(false); break;
5768                         }
5769 
5770                         pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
5771                     }
5772 
5773                     int fixupIndex1 = 0;
5774                     int indexBits = 4;
5775                     if (modeInfo.m_partitioned)
5776                     {
5777                         fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
5778                         indexBits = 3;
5779                     }
5780 
5781                     for (int px = 0; px < 16; px++)
5782                     {
5783                         ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
5784                         if (px == 0 || px == fixupIndex1)
5785                             pv.Pack(index, indexBits - 1);
5786                         else
5787                             pv.Pack(index, indexBits);
5788                     }
5789 
5790                     pv.Flush(packedBlocks + 16 * block);
5791                 }
5792             }
5793 
SignExtendSingle(int & v,int bits)5794             static void SignExtendSingle(int &v, int bits)
5795             {
5796                 if (v & (1 << (bits - 1)))
5797                     v |= -(1 << bits);
5798             }
5799 
UnpackOne(PixelBlockF16 & output,const uint8_t * pBC,bool isSigned)5800             static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
5801             {
5802                 UnpackingVector pv;
5803                 pv.Init(pBC);
5804 
5805                 int numModeBits = 2;
5806                 int modeBits = pv.Unpack(2);
5807                 if (modeBits != 0 && modeBits != 1)
5808                 {
5809                     modeBits |= pv.Unpack(3) << 2;
5810                     numModeBits += 3;
5811                 }
5812 
5813                 int mode = -1;
5814                 for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
5815                 {
5816                     if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
5817                     {
5818                         mode = possibleMode;
5819                         break;
5820                     }
5821                 }
5822 
5823                 if (mode < 0)
5824                 {
5825                     for (int px = 0; px < 16; px++)
5826                     {
5827                         for (int ch = 0; ch < 3; ch++)
5828                             output.m_pixels[px][ch] = 0;
5829                         output.m_pixels[px][3] = 0x3c00;	// 1.0
5830                     }
5831                     return;
5832                 }
5833 
5834                 const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
5835                 const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
5836                 const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
5837 
5838                 int32_t partition = 0;
5839                 int32_t eps[2][2][3];
5840 
5841                 for (int subset = 0; subset < 2; subset++)
5842                     for (int epi = 0; epi < 2; epi++)
5843                         for (int ch = 0; ch < 3; ch++)
5844                             eps[subset][epi][ch] = 0;
5845 
5846                 for (size_t i = numModeBits; i < headerBits; i++)
5847                 {
5848                     int32_t *pCodedValue = NULL;
5849 
5850                     switch (desc[i].m_eField)
5851                     {
5852                     case BC6HData::D:  pCodedValue = &partition; break;
5853                     case BC6HData::RW: pCodedValue = &eps[0][0][0]; break;
5854                     case BC6HData::RX: pCodedValue = &eps[0][1][0]; break;
5855                     case BC6HData::RY: pCodedValue = &eps[1][0][0]; break;
5856                     case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break;
5857                     case BC6HData::GW: pCodedValue = &eps[0][0][1]; break;
5858                     case BC6HData::GX: pCodedValue = &eps[0][1][1]; break;
5859                     case BC6HData::GY: pCodedValue = &eps[1][0][1]; break;
5860                     case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break;
5861                     case BC6HData::BW: pCodedValue = &eps[0][0][2]; break;
5862                     case BC6HData::BX: pCodedValue = &eps[0][1][2]; break;
5863                     case BC6HData::BY: pCodedValue = &eps[1][0][2]; break;
5864                     case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break;
5865                     default: assert(false); break;
5866                     }
5867 
5868                     (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
5869                 }
5870 
5871 
5872                 uint16_t modeID = modeInfo.m_modeID;
5873 
5874                 int fixupIndex1 = 0;
5875                 int indexBits = 4;
5876                 int numSubsets = 1;
5877                 if (modeInfo.m_partitioned)
5878                 {
5879                     fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
5880                     indexBits = 3;
5881                     numSubsets = 2;
5882                 }
5883 
5884                 int indexes[16];
5885                 for (int px = 0; px < 16; px++)
5886                 {
5887                     if (px == 0 || px == fixupIndex1)
5888                         indexes[px] = pv.Unpack(indexBits - 1);
5889                     else
5890                         indexes[px] = pv.Unpack(indexBits);
5891                 }
5892 
5893                 if (modeInfo.m_partitioned)
5894                 {
5895                     for (int ch = 0; ch < 3; ch++)
5896                     {
5897                         if (isSigned)
5898                             SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
5899                         if (modeInfo.m_transformed || isSigned)
5900                         {
5901                             SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
5902                             SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
5903                             SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
5904                         }
5905                     }
5906                 }
5907                 else
5908                 {
5909                     for (int ch = 0; ch < 3; ch++)
5910                     {
5911                         if (isSigned)
5912                             SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
5913                         if (modeInfo.m_transformed || isSigned)
5914                             SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
5915                     }
5916                 }
5917 
5918                 int aPrec = modeInfo.m_aPrec;
5919 
5920                 if (modeInfo.m_transformed)
5921                 {
5922                     for (int ch = 0; ch < 3; ch++)
5923                     {
5924                         int wrapMask = (1 << aPrec) - 1;
5925 
5926                         eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
5927                         if (isSigned)
5928                             SignExtendSingle(eps[0][1][ch], aPrec);
5929 
5930                         if (modeInfo.m_partitioned)
5931                         {
5932                             eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
5933                             eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
5934 
5935                             if (isSigned)
5936                             {
5937                                 SignExtendSingle(eps[1][0][ch], aPrec);
5938                                 SignExtendSingle(eps[1][1][ch], aPrec);
5939                             }
5940                         }
5941                     }
5942                 }
5943 
5944                 // Unquantize endpoints
5945                 for (int subset = 0; subset < numSubsets; subset++)
5946                 {
5947                     for (int epi = 0; epi < 2; epi++)
5948                     {
5949                         for (int ch = 0; ch < 3; ch++)
5950                         {
5951                             int &v = eps[subset][epi][ch];
5952 
5953                             if (isSigned)
5954                             {
5955                                 if (aPrec >= 16)
5956                                 {
5957                                     // Nothing
5958                                 }
5959                                 else
5960                                 {
5961                                     bool s = false;
5962                                     int comp = v;
5963                                     if (v < 0)
5964                                     {
5965                                         s = true;
5966                                         comp = -comp;
5967                                     }
5968 
5969                                     int unq = 0;
5970                                     if (comp == 0)
5971                                         unq = 0;
5972                                     else if (comp >= ((1 << (aPrec - 1)) - 1))
5973                                         unq = 0x7fff;
5974                                     else
5975                                         unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
5976 
5977                                     if (s)
5978                                         unq = -unq;
5979 
5980                                     v = unq;
5981                                 }
5982                             }
5983                             else
5984                             {
5985                                 if (aPrec >= 15)
5986                                 {
5987                                     // Nothing
5988                                 }
5989                                 else if (v == 0)
5990                                 {
5991                                     // Nothing
5992                                 }
5993                                 else if (v == ((1 << aPrec) - 1))
5994                                     v = 0xffff;
5995                                 else
5996                                     v = ((v << 16) + 0x8000) >> aPrec;
5997                             }
5998                         }
5999                     }
6000                 }
6001 
6002                 const int *weights = BC7Data::g_weightTables[indexBits];
6003 
6004                 for (int px = 0; px < 16; px++)
6005                 {
6006                     int subset = 0;
6007                     if (modeInfo.m_partitioned)
6008                         subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
6009 
6010                     int w = weights[indexes[px]];
6011                     for (int ch = 0; ch < 3; ch++)
6012                     {
6013                         int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
6014 
6015                         if (isSigned)
6016                         {
6017                             if (comp < 0)
6018                                 comp = -(((-comp) * 31) >> 5);
6019                             else
6020                                 comp = (comp * 31) >> 5;
6021 
6022                             int s = 0;
6023                             if (comp < 0)
6024                             {
6025                                 s = 0x8000;
6026                                 comp = -comp;
6027                             }
6028 
6029                             output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
6030                         }
6031                         else
6032                         {
6033                             comp = (comp * 31) >> 6;
6034                             output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
6035                         }
6036                     }
6037                     output.m_pixels[px][3] = 0x3c00;	// 1.0
6038                 }
6039             }
6040         };
6041 
6042         namespace S3TCSingleColorTables
6043         {
            // One row of a single-colour lookup table. The tables initialise this struct
            // positionally, so the field order must not change.
            struct SingleColorTableEntry
            {
                uint8_t m_min;          // first endpoint value of the pair
                uint8_t m_max;          // second endpoint value of the pair (may be below m_min in the tables)
                uint8_t m_actualColor;  // presumably the 8-bit value the pair actually reproduces - confirm at the lookup sites
                uint8_t m_span;         // equals |m_min - m_max| in the table rows visible here
            };
6051 
            // 256-entry single-colour lookup table; each row is { m_min, m_max, m_actualColor, m_span }.
            // NOTE(review): presumably indexed by the desired 8-bit channel value, with
            // endpoints drawn from 5-bit values expanded to 8 bits (0, 8, 16, 24, 33, ...);
            // the lookup sites are outside this chunk - confirm there.
            SingleColorTableEntry g_singleColor5_3[256] =
            {
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
            };
6087 
6088             SingleColorTableEntry g_singleColor6_3[256] =
6089             {
6090                 { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
6091                 { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
6092                 { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 },
6093                 { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 },
6094                 { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 },
6095                 { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
6096                 { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
6097                 { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
6098                 { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
6099                 { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
6100                 { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
6101                 { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
6102                 { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
6103                 { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
6104                 { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
6105                 { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
6106                 { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
6107                 { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
6108                 { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
6109                 { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
6110                 { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
6111                 { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
6112                 { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
6113                 { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
6114                 { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
6115                 { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
6116                 { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 },
6117                 { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 },
6118                 { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 },
6119                 { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
6120                 { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6121                 { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6122             };
6123 
6124             SingleColorTableEntry g_singleColor5_2[256] =
6125             {
6126                 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
6127                 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
6128                 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
6129                 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
6130                 { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
6131                 { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
6132                 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
6133                 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
6134                 { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
6135                 { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
6136                 { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
6137                 { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
6138                 { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
6139                 { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
6140                 { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
6141                 { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
6142                 { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
6143                 { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
6144                 { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
6145                 { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
6146                 { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
6147                 { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
6148                 { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
6149                 { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
6150                 { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
6151                 { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
6152                 { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
6153                 { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
6154                 { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
6155                 { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
6156                 { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6157                 { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6158             };
6159 
6160             SingleColorTableEntry g_singleColor6_2[256] =
6161             {
6162                 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
6163                 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
6164                 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
6165                 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
6166                 { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
6167                 { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
6168                 { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
6169                 { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
6170                 { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
6171                 { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
6172                 { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
6173                 { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
6174                 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
6175                 { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
6176                 { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
6177                 { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
6178                 { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
6179                 { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
6180                 { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
6181                 { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
6182                 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
6183                 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
6184                 { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
6185                 { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
6186                 { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
6187                 { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
6188                 { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
6189                 { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
6190                 { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
6191                 { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
6192                 { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6193                 { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6194             };
6195 
6196             SingleColorTableEntry g_singleColor5_3_p[256] =
6197             {
6198                 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
6199                 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
6200                 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
6201                 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
6202                 { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
6203                 { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
6204                 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
6205                 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
6206                 { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
6207                 { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
6208                 { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
6209                 { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
6210                 { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
6211                 { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
6212                 { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
6213                 { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
6214                 { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
6215                 { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
6216                 { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
6217                 { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
6218                 { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
6219                 { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
6220                 { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
6221                 { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
6222                 { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
6223                 { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
6224                 { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
6225                 { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
6226                 { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
6227                 { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
6228                 { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6229                 { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6230             };
6231 
6232             SingleColorTableEntry g_singleColor6_3_p[256] =
6233             {
6234                 { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
6235                 { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
6236                 { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
6237                 { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
6238                 { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
6239                 { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
6240                 { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
6241                 { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
6242                 { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
6243                 { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
6244                 { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
6245                 { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
6246                 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
6247                 { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
6248                 { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
6249                 { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
6250                 { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
6251                 { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
6252                 { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
6253                 { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
6254                 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
6255                 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
6256                 { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
6257                 { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
6258                 { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
6259                 { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
6260                 { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
6261                 { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
6262                 { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
6263                 { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
6264                 { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6265                 { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6266             };
6267 
6268             SingleColorTableEntry g_singleColor5_2_p[256] =
6269             {
6270                 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
6271                 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
6272                 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
6273                 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
6274                 { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
6275                 { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
6276                 { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
6277                 { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
6278                 { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
6279                 { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
6280                 { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
6281                 { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
6282                 { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
6283                 { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
6284                 { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
6285                 { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
6286                 { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
6287                 { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
6288                 { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
6289                 { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
6290                 { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
6291                 { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
6292                 { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
6293                 { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
6294                 { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
6295                 { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
6296                 { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
6297                 { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
6298                 { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
6299                 { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
6300                 { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6301                 { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6302             };
6303 
6304             SingleColorTableEntry g_singleColor6_2_p[256] =
6305             {
6306                 { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
6307                 { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
6308                 { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
6309                 { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
6310                 { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
6311                 { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
6312                 { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
6313                 { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
6314                 { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
6315                 { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
6316                 { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
6317                 { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
6318                 { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
6319                 { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
6320                 { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
6321                 { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
6322                 { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
6323                 { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
6324                 { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
6325                 { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
6326                 { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
6327                 { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
6328                 { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
6329                 { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
6330                 { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
6331                 { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
6332                 { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
6333                 { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
6334                 { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
6335                 { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
6336                 { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
6337                 { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
6338             };
6339         }
6340 
6341         class S3TCComputer
6342         {
6343         public:
6344             typedef ParallelMath::Float MFloat;
6345             typedef ParallelMath::SInt16 MSInt16;
6346             typedef ParallelMath::UInt15 MUInt15;
6347             typedef ParallelMath::UInt16 MUInt16;
6348             typedef ParallelMath::SInt32 MSInt32;
6349 
Init(MFloat & error)6350             static void Init(MFloat& error)
6351             {
6352                 error = ParallelMath::MakeFloat(FLT_MAX);
6353             }
6354 
QuantizeTo6Bits(MUInt15 & v)6355             static void QuantizeTo6Bits(MUInt15& v)
6356             {
6357                 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
6358                 v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
6359             }
6360 
QuantizeTo5Bits(MUInt15 & v)6361             static void QuantizeTo5Bits(MUInt15& v)
6362             {
6363                 MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
6364                 v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
6365             }
6366 
QuantizeTo565(MUInt15 endPoint[3])6367             static void QuantizeTo565(MUInt15 endPoint[3])
6368             {
6369                 QuantizeTo5Bits(endPoint[0]);
6370                 QuantizeTo6Bits(endPoint[1]);
6371                 QuantizeTo5Bits(endPoint[2]);
6372             }
6373 
ParanoidFactorForSpan(const MSInt16 & span)6374             static MFloat ParanoidFactorForSpan(const MSInt16& span)
6375             {
6376                 return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
6377             }
6378 
// Returns (|a - b| + d)^2, with the subtraction done in signed 16-bit form and the
// rest in floating point.
static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
{
    MSInt16 lhs = ParallelMath::LosslessCast<MSInt16>::Cast(a);
    MSInt16 rhs = ParallelMath::LosslessCast<MSInt16>::Cast(b);

    MFloat padded = ParallelMath::Abs(ParallelMath::ToFloat(lhs - rhs));
    padded = padded + d;

    return padded * padded;
}
6385 
TestSingleColor(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],int range,const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 bestIndexes[16],MUInt15 & bestRange,const ParallelMath::RoundTowardNearestForScope * rtn)6386             static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
6387                 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
6388             {
6389                 float channelWeightsSq[3];
6390 
6391                 for (int ch = 0; ch < 3; ch++)
6392                     channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
6393 
6394                 MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
6395 
6396                 for (int px = 0; px < 16; px++)
6397                 {
6398                     for (int ch = 0; ch < 3; ch++)
6399                         totals[ch] = totals[ch] + pixels[px][ch];
6400                 }
6401 
6402                 MUInt15 average[3];
6403                 for (int ch = 0; ch < 3; ch++)
6404                     average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
6405 
6406                 const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL;
6407                 const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL;
6408                 if (flags & cvtt::Flags::S3TC_Paranoid)
6409                 {
6410                     if (range == 4)
6411                     {
6412                         rbTable = S3TCSingleColorTables::g_singleColor5_3_p;
6413                         gTable = S3TCSingleColorTables::g_singleColor6_3_p;
6414                     }
6415                     else
6416                     {
6417                         assert(range == 3);
6418                         rbTable = S3TCSingleColorTables::g_singleColor5_2_p;
6419                         gTable = S3TCSingleColorTables::g_singleColor6_2_p;
6420                     }
6421                 }
6422                 else
6423                 {
6424                     if (range == 4)
6425                     {
6426                         rbTable = S3TCSingleColorTables::g_singleColor5_3;
6427                         gTable = S3TCSingleColorTables::g_singleColor6_3;
6428                     }
6429                     else
6430                     {
6431                         assert(range == 3);
6432                         rbTable = S3TCSingleColorTables::g_singleColor5_2;
6433                         gTable = S3TCSingleColorTables::g_singleColor6_2;
6434                     }
6435                 }
6436 
6437                 MUInt15 interpolated[3];
6438                 MUInt15 eps[2][3];
6439                 MSInt16 spans[3];
6440                 for (int i = 0; i < ParallelMath::ParallelSize; i++)
6441                 {
6442                     for (int ch = 0; ch < 3; ch++)
6443                     {
6444                         uint16_t avg = ParallelMath::Extract(average[ch], i);
6445                         const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
6446                         ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
6447                         ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
6448                         ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
6449                         ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
6450                     }
6451                 }
6452 
6453                 MFloat error = ParallelMath::MakeFloatZero();
6454                 if (flags & cvtt::Flags::S3TC_Paranoid)
6455                 {
6456                     MFloat spanParanoidFactors[3];
6457                     for (int ch = 0; ch < 3; ch++)
6458                         spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
6459 
6460                     for (int px = 0; px < 16; px++)
6461                     {
6462                         for (int ch = 0; ch < 3; ch++)
6463                             error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
6464                     }
6465                 }
6466                 else
6467                 {
6468                     for (int px = 0; px < 16; px++)
6469                     {
6470                         for (int ch = 0; ch < 3; ch++)
6471                             error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
6472                     }
6473                 }
6474 
6475                 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
6476                 ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
6477 
6478                 if (ParallelMath::AnySet(better16))
6479                 {
6480                     bestError = ParallelMath::Min(bestError, error);
6481                     for (int epi = 0; epi < 2; epi++)
6482                         for (int ch = 0; ch < 3; ch++)
6483                             ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
6484 
6485                     MUInt15 vindexes = ParallelMath::MakeUInt15(1);
6486                     for (int px = 0; px < 16; px++)
6487                         ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
6488 
6489                     ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
6490                 }
6491             }
6492 
TestEndpoints(uint32_t flags,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const MFloat preWeightedPixels[16][4],const MUInt15 unquantizedEndPoints[2][3],int range,const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 bestIndexes[16],MUInt15 & bestRange,EndpointRefiner<3> * refiner,const ParallelMath::RoundTowardNearestForScope * rtn)6493             static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
6494                 MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
6495             {
6496                 float channelWeightsSq[3];
6497 
6498                 for (int ch = 0; ch < 3; ch++)
6499                     channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
6500 
6501                 MUInt15 endPoints[2][3];
6502 
6503                 for (int ep = 0; ep < 2; ep++)
6504                     for (int ch = 0; ch < 3; ch++)
6505                         endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
6506 
6507                 QuantizeTo565(endPoints[0]);
6508                 QuantizeTo565(endPoints[1]);
6509 
6510                 IndexSelector<3> selector;
6511                 selector.Init<false>(channelWeights, endPoints, range);
6512 
6513                 MUInt15 indexes[16];
6514 
6515                 MFloat paranoidFactors[3];
6516                 for (int ch = 0; ch < 3; ch++)
6517                     paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
6518 
6519                 MFloat error = ParallelMath::MakeFloatZero();
6520                 AggregatedError<3> aggError;
6521                 for (int px = 0; px < 16; px++)
6522                 {
6523                     MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
6524                     indexes[px] = index;
6525 
6526                     if (refiner)
6527                         refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
6528 
6529                     MUInt15 reconstructed[3];
6530                     selector.ReconstructLDRPrecise(index, reconstructed);
6531 
6532                     if (flags & Flags::S3TC_Paranoid)
6533                     {
6534                         for (int ch = 0; ch < 3; ch++)
6535                             error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
6536                     }
6537                     else
6538                         BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
6539                 }
6540 
6541                 if (!(flags & Flags::S3TC_Paranoid))
6542                     error = aggError.Finalize(flags, channelWeightsSq);
6543 
6544                 ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
6545 
6546                 if (ParallelMath::AnySet(better))
6547                 {
6548                     ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
6549 
6550                     ParallelMath::ConditionalSet(bestError, better, error);
6551 
6552                     for (int ep = 0; ep < 2; ep++)
6553                         for (int ch = 0; ch < 3; ch++)
6554                             ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
6555 
6556                     for (int px = 0; px < 16; px++)
6557                         ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
6558 
6559                     ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
6560                 }
6561             }
6562 
TestCounts(uint32_t flags,const int * counts,int nCounts,const MUInt15 & numElements,const MUInt15 pixels[16][4],const MFloat floatPixels[16][4],const MFloat preWeightedPixels[16][4],bool alphaTest,const MFloat floatSortedInputs[16][4],const MFloat preWeightedFloatSortedInputs[16][4],const float * channelWeights,MFloat & bestError,MUInt15 bestEndpoints[2][3],MUInt15 bestIndexes[16],MUInt15 & bestRange,const ParallelMath::RoundTowardNearestForScope * rtn)6563             static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
6564                 const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
6565                 const ParallelMath::RoundTowardNearestForScope* rtn)
6566             {
6567                 UNREFERENCED_PARAMETER(alphaTest);
6568                 UNREFERENCED_PARAMETER(flags);
6569 
6570                 EndpointRefiner<3> refiner;
6571 
6572                 refiner.Init(nCounts, channelWeights);
6573 
6574                 bool escape = false;
6575                 int e = 0;
6576                 for (int i = 0; i < nCounts; i++)
6577                 {
6578                     for (int n = 0; n < counts[i]; n++)
6579                     {
6580                         ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
6581                         if (!ParallelMath::AnySet(valid))
6582                         {
6583                             escape = true;
6584                             break;
6585                         }
6586 
6587                         if (ParallelMath::AllSet(valid))
6588                             refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
6589                         else
6590                         {
6591                             MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
6592                             refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
6593                         }
6594                     }
6595 
6596                     if (escape)
6597                         break;
6598                 }
6599 
6600                 MUInt15 endPoints[2][3];
6601                 refiner.GetRefinedEndpointsLDR(endPoints, rtn);
6602 
6603                 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
6604             }
6605 
PackExplicitAlpha(uint32_t flags,const PixelBlockU8 * inputs,int inputChannel,uint8_t * packedBlocks,size_t packedBlockStride)6606             static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
6607             {
6608                 UNREFERENCED_PARAMETER(flags);
6609                 ParallelMath::RoundTowardNearestForScope rtn;
6610 
6611                 float weights[1] = { 1.0f };
6612 
6613                 MUInt15 pixels[16];
6614                 MFloat floatPixels[16];
6615 
6616                 for (int px = 0; px < 16; px++)
6617                 {
6618                     ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
6619                     floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
6620                 }
6621 
6622                 MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
6623 
6624                 IndexSelector<1> selector;
6625                 selector.Init<false>(weights, ep, 16);
6626 
6627                 MUInt15 indexes[16];
6628 
6629                 for (int px = 0; px < 16; px++)
6630                     indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
6631 
6632                 for (int block = 0; block < ParallelMath::ParallelSize; block++)
6633                 {
6634                     for (int px = 0; px < 16; px += 8)
6635                     {
6636                         int index0 = ParallelMath::Extract(indexes[px], block);
6637                         int index1 = ParallelMath::Extract(indexes[px], block);
6638 
6639                         packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
6640                     }
6641 
6642                     packedBlocks += packedBlockStride;
6643                 }
6644             }
6645 
PackInterpolatedAlpha(uint32_t flags,const PixelBlockU8 * inputs,int inputChannel,uint8_t * packedBlocks,size_t packedBlockStride,bool isSigned,int maxTweakRounds,int numRefineRounds)6646             static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
6647             {
6648                 if (maxTweakRounds < 1)
6649                     maxTweakRounds = 1;
6650 
6651                 if (numRefineRounds < 1)
6652                     numRefineRounds = 1;
6653 
6654                 ParallelMath::RoundTowardNearestForScope rtn;
6655 
6656                 float oneWeight[1] = { 1.0f };
6657 
6658                 MUInt15 pixels[16];
6659                 MFloat floatPixels[16];
6660 
6661                 MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
6662                 MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
6663 
6664                 for (int px = 0; px < 16; px++)
6665                 {
6666                     ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
6667 
6668                     if (isSigned)
6669                         pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
6670 
6671                     floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
6672                 }
6673 
6674                 MUInt15 sortedPixels[16];
6675                 for (int px = 0; px < 16; px++)
6676                     sortedPixels[px] = pixels[px];
6677 
6678                 for (int sortEnd = 15; sortEnd > 0; sortEnd--)
6679                 {
6680                     for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
6681                     {
6682                         MUInt15 a = sortedPixels[sortOffset];
6683                         MUInt15 b = sortedPixels[sortOffset + 1];
6684 
6685                         sortedPixels[sortOffset] = ParallelMath::Min(a, b);
6686                         sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
6687                     }
6688                 }
6689 
6690                 MUInt15 zero = ParallelMath::MakeUInt15(0);
6691                 MUInt15 one = ParallelMath::MakeUInt15(1);
6692 
6693                 MUInt15 bestIsFullRange = zero;
6694                 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
6695                 MUInt15 bestEP[2] = { zero, zero };
6696                 MUInt15 bestIndexes[16] = {
6697                     zero, zero, zero, zero,
6698                     zero, zero, zero, zero,
6699                     zero, zero, zero, zero,
6700                     zero, zero, zero, zero
6701                 };
6702 
6703                 // Full-precision
6704                 {
6705                     MUInt15 minEP = sortedPixels[0];
6706                     MUInt15 maxEP = sortedPixels[15];
6707 
6708                     MFloat base[1] = { ParallelMath::ToFloat(minEP) };
6709                     MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
6710 
6711                     UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
6712 
6713                     int numTweakRounds = BCCommon::TweakRoundsForRange(8);
6714                     if (numTweakRounds > maxTweakRounds)
6715                         numTweakRounds = maxTweakRounds;
6716 
6717                     for (int tweak = 0; tweak < numTweakRounds; tweak++)
6718                     {
6719                         MUInt15 ep[2][1];
6720 
6721                         ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
6722 
6723                         for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
6724                         {
6725                             EndpointRefiner<1> refiner;
6726                             refiner.Init(8, oneWeight);
6727 
6728                             if (isSigned)
6729                                 for (int epi = 0; epi < 2; epi++)
6730                                     ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
6731 
6732                             IndexSelector<1> indexSelector;
6733                             indexSelector.Init<false>(oneWeight, ep, 8);
6734 
6735                             MUInt15 indexes[16];
6736 
6737                             AggregatedError<1> aggError;
6738                             for (int px = 0; px < 16; px++)
6739                             {
6740                                 MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
6741 
6742                                 MUInt15 reconstructedPixel;
6743 
6744                                 indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
6745                                 BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
6746 
6747                                 if (refinePass != numRefineRounds - 1)
6748                                     refiner.ContributeUnweightedPW(&floatPixels[px], index);
6749 
6750                                 indexes[px] = index;
6751                             }
6752                             MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
6753 
6754                             ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
6755                             ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
6756 
6757                             if (ParallelMath::AnySet(errorBetter16))
6758                             {
6759                                 bestError = ParallelMath::Min(error, bestError);
6760                                 ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
6761                                 for (int px = 0; px < 16; px++)
6762                                     ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
6763 
6764                                 for (int epi = 0; epi < 2; epi++)
6765                                     ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
6766                             }
6767 
6768                             if (refinePass != numRefineRounds - 1)
6769                                 refiner.GetRefinedEndpointsLDR(ep, &rtn);
6770                         }
6771                     }
6772                 }
6773 
6774                 // Reduced precision with special endpoints
6775                 {
6776                     MUInt15 bestHeuristicMin = sortedPixels[0];
6777                     MUInt15 bestHeuristicMax = sortedPixels[15];
6778 
6779                     ParallelMath::Int16CompFlag canTryClipping;
6780 
6781                     // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
6782                     // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
6783                     // This will usually not find anything, but it's cheap to check.
6784 
6785                     {
6786                         MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
6787                         MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
6788 
6789                         MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
6790                         canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
6791                     }
6792 
6793                     if (ParallelMath::AnySet(canTryClipping))
6794                     {
6795                         MUInt15 lowClearances[16];
6796                         MUInt15 highClearances[16];
6797                         MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
6798 
6799                         lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
6800 
6801                         for (int px = 1; px < 16; px++)
6802                         {
6803                             lowClearances[px] = sortedPixels[px - 1];
6804                             highClearances[px] = highTerminal - sortedPixels[16 - px];
6805                         }
6806 
6807                         for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
6808                         {
6809                             uint16_t numSkippedLow = firstIndex;
6810 
6811                             MUInt15 lowClearance = lowClearances[firstIndex];
6812 
6813                             for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
6814                             {
6815                                 uint16_t numSkippedHigh = 15 - lastIndex;
6816                                 uint16_t numSkipped = numSkippedLow + numSkippedHigh;
6817 
6818                                 MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
6819 
6820                                 ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
6821 
6822                                 if (!ParallelMath::AnySet(areMoreSkipped))
6823                                     continue;
6824 
6825                                 MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
6826                                 MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
6827 
6828                                 MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
6829 
6830                                 ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
6831                                 ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
6832                                 ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
6833                             }
6834                         }
6835                     }
6836 
6837                     MUInt15 bestSimpleMin = one;
6838                     MUInt15 bestSimpleMax = highTerminalMinusOne;
6839 
6840                     for (int px = 0; px < 16; px++)
6841                     {
6842                         ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
6843                         ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
6844                     }
6845 
6846                     MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
6847                     MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
6848 
6849                     int minEPRange = 2;
6850                     if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
6851                         minEPRange = 1;
6852 
6853                     int maxEPRange = 2;
6854                     if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
6855                         maxEPRange = 1;
6856 
6857                     for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
6858                     {
6859                         for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
6860                         {
6861                             MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
6862                             MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
6863 
6864                             UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
6865 
6866                             int numTweakRounds = BCCommon::TweakRoundsForRange(6);
6867                             if (numTweakRounds > maxTweakRounds)
6868                                 numTweakRounds = maxTweakRounds;
6869 
6870                             for (int tweak = 0; tweak < numTweakRounds; tweak++)
6871                             {
6872                                 MUInt15 ep[2][1];
6873 
6874                                 ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
6875 
6876                                 for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
6877                                 {
6878                                     EndpointRefiner<1> refiner;
6879                                     refiner.Init(6, oneWeight);
6880 
6881                                     if (isSigned)
6882                                         for (int epi = 0; epi < 2; epi++)
6883                                             ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
6884 
6885                                     IndexSelector<1> indexSelector;
6886                                     indexSelector.Init<false>(oneWeight, ep, 6);
6887 
6888                                     MUInt15 indexes[16];
6889                                     MFloat error = ParallelMath::MakeFloatZero();
6890 
6891                                     for (int px = 0; px < 16; px++)
6892                                     {
6893                                         MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
6894 
6895                                         MUInt15 reconstructedPixel;
6896 
6897                                         indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
6898 
6899                                         MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
6900                                         MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
6901                                         MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
6902 
6903                                         MFloat bestPixelError = zeroError;
6904                                         MUInt15 index = ParallelMath::MakeUInt15(6);
6905 
6906                                         ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
6907                                         bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
6908 
6909                                         ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
6910 
6911                                         if (ParallelMath::AllSet(selectedIndexBetter))
6912                                         {
6913                                             if (refinePass != numRefineRounds - 1)
6914                                                 refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
6915                                         }
6916                                         else
6917                                         {
6918                                             MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
6919 
6920                                             if (refinePass != numRefineRounds - 1)
6921                                                 refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
6922                                         }
6923 
6924                                         ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
6925                                         bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
6926 
6927                                         error = error + bestPixelError;
6928 
6929                                         indexes[px] = index;
6930                                     }
6931 
6932                                     ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
6933                                     ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
6934 
6935                                     if (ParallelMath::AnySet(errorBetter16))
6936                                     {
6937                                         bestError = ParallelMath::Min(error, bestError);
6938                                         ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
6939                                         for (int px = 0; px < 16; px++)
6940                                             ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
6941 
6942                                         for (int epi = 0; epi < 2; epi++)
6943                                             ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
6944                                     }
6945 
6946                                     if (refinePass != numRefineRounds - 1)
6947                                         refiner.GetRefinedEndpointsLDR(ep, &rtn);
6948                                 }
6949                             }
6950                         }
6951                     }
6952                 }
6953 
6954                 for (int block = 0; block < ParallelMath::ParallelSize; block++)
6955                 {
6956                     int ep0 = ParallelMath::Extract(bestEP[0], block);
6957                     int ep1 = ParallelMath::Extract(bestEP[1], block);
6958                     int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
6959 
6960                     if (isSigned)
6961                     {
6962                         ep0 -= 127;
6963                         ep1 -= 127;
6964 
6965                         assert(ep0 >= -127 && ep0 <= 127);
6966                         assert(ep1 >= -127 && ep1 <= 127);
6967                     }
6968 
6969 
6970                     bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
6971 
6972                     if (swapEndpoints)
6973                         std::swap(ep0, ep1);
6974 
6975                     uint16_t dumpBits = 0;
6976                     int dumpBitsOffset = 0;
6977                     int dumpByteOffset = 2;
6978                     packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
6979                     packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
6980 
6981                     int maxValue = (isFullRange != 0) ? 7 : 5;
6982 
6983                     for (int px = 0; px < 16; px++)
6984                     {
6985                         int index = ParallelMath::Extract(bestIndexes[px], block);
6986 
6987                         if (swapEndpoints && index <= maxValue)
6988                             index = maxValue - index;
6989 
6990                         if (index != 0)
6991                         {
6992                             if (index == maxValue)
6993                                 index = 1;
6994                             else if (index < maxValue)
6995                                 index++;
6996                         }
6997 
6998                         assert(index >= 0 && index < 8);
6999 
7000                         dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
7001                         dumpBitsOffset += 3;
7002 
7003                         if (dumpBitsOffset >= 8)
7004                         {
7005                             assert(dumpByteOffset < 8);
7006                             packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
7007                             dumpBits >>= 8;
7008                             dumpBitsOffset -= 8;
7009                             dumpByteOffset++;
7010                         }
7011                     }
7012 
7013                     assert(dumpBitsOffset == 0);
7014                     assert(dumpByteOffset == 8);
7015 
7016                     packedBlocks += packedBlockStride;
7017                 }
7018             }
7019 
PackRGB(uint32_t flags,const PixelBlockU8 * inputs,uint8_t * packedBlocks,size_t packedBlockStride,const float channelWeights[4],bool alphaTest,float alphaThreshold,bool exhaustive,int maxTweakRounds,int numRefineRounds)7020             static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
7021             {
7022                 ParallelMath::RoundTowardNearestForScope rtn;
7023 
7024                 if (numRefineRounds < 1)
7025                     numRefineRounds = 1;
7026 
7027                 if (maxTweakRounds < 1)
7028                     maxTweakRounds = 1;
7029 
7030                 EndpointSelector<3, 8> endpointSelector;
7031 
7032                 MUInt15 pixels[16][4];
7033                 MFloat floatPixels[16][4];
7034 
7035                 MFloat preWeightedPixels[16][4];
7036 
7037                 for (int px = 0; px < 16; px++)
7038                 {
7039                     for (int ch = 0; ch < 4; ch++)
7040                         ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
7041                 }
7042 
7043                 for (int px = 0; px < 16; px++)
7044                 {
7045                     for (int ch = 0; ch < 4; ch++)
7046                         floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
7047                 }
7048 
7049                 if (alphaTest)
7050                 {
7051                     MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
7052 
7053                     for (int px = 0; px < 16; px++)
7054                     {
7055                         ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
7056                         pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
7057                     }
7058                 }
7059 
7060                 BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
7061 
7062                 MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
7063 
7064                 for (int px = 0; px < 16; px++)
7065                     minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
7066 
7067                 MFloat pixelWeights[16];
7068                 for (int px = 0; px < 16; px++)
7069                 {
7070                     pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
7071                     if (alphaTest)
7072                     {
7073                         ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
7074 
7075                         ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
7076                     }
7077                 }
7078 
7079                 for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
7080                 {
7081                     for (int px = 0; px < 16; px++)
7082                         endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
7083 
7084                     endpointSelector.FinishPass(pass);
7085                 }
7086 
7087                 UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
7088 
7089                 MUInt15 bestEndpoints[2][3];
7090                 MUInt15 bestIndexes[16];
7091                 MUInt15 bestRange = ParallelMath::MakeUInt15(0);
7092                 MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
7093 
7094                 for (int px = 0; px < 16; px++)
7095                     bestIndexes[px] = ParallelMath::MakeUInt15(0);
7096 
7097                 for (int ep = 0; ep < 2; ep++)
7098                     for (int ch = 0; ch < 3; ch++)
7099                         bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
7100 
7101                 if (exhaustive)
7102                 {
7103                     MSInt16 sortBins[16];
7104 
7105                     {
7106                         // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
7107                         // and pack the original indexes into the low bits.
7108 
7109                         MUInt15 sortEP[2][3];
7110                         ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
7111 
7112                         IndexSelector<3> sortSelector;
7113                         sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
7114 
7115                         for (int16_t px = 0; px < 16; px++)
7116                         {
7117                             MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
7118 
7119                             if (alphaTest)
7120                             {
7121                                 ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
7122 
7123                                 ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
7124                             }
7125 
7126                             sortBin = sortBin + ParallelMath::MakeSInt16(px);
7127 
7128                             sortBins[px] = sortBin;
7129                         }
7130                     }
7131 
7132                     // Sort bins
7133                     for (int sortEnd = 1; sortEnd < 16; sortEnd++)
7134                     {
7135                         for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
7136                         {
7137                             MSInt16 a = sortBins[sortLoc];
7138                             MSInt16 b = sortBins[sortLoc - 1];
7139 
7140                             sortBins[sortLoc] = ParallelMath::Max(a, b);
7141                             sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
7142                         }
7143                     }
7144 
7145                     MUInt15 firstElement = ParallelMath::MakeUInt15(0);
7146                     for (uint16_t e = 0; e < 16; e++)
7147                     {
7148                         ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
7149                         ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
7150                         if (!ParallelMath::AnySet(isInvalid))
7151                             break;
7152                     }
7153 
7154                     MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
7155 
7156                     MUInt15 sortedInputs[16][4];
7157                     MFloat floatSortedInputs[16][4];
7158                     MFloat pwFloatSortedInputs[16][4];
7159 
7160                     for (int e = 0; e < 16; e++)
7161                     {
7162                         for (int ch = 0; ch < 4; ch++)
7163                             sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
7164                     }
7165 
7166                     for (int block = 0; block < ParallelMath::ParallelSize; block++)
7167                     {
7168                         for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
7169                         {
7170                             ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
7171                             int originalIndex = (sortBin & 15);
7172 
7173                             for (int ch = 0; ch < 4; ch++)
7174                                 ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
7175                         }
7176                     }
7177 
7178                     for (int e = 0; e < 16; e++)
7179                     {
7180                         for (int ch = 0; ch < 4; ch++)
7181                         {
7182                             MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
7183                             floatSortedInputs[e][ch] = f;
7184                             pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
7185                         }
7186                     }
7187 
7188                     for (int n0 = 0; n0 <= 15; n0++)
7189                     {
7190                         int remainingFor1 = 16 - n0;
7191                         if (remainingFor1 == 16)
7192                             remainingFor1 = 15;
7193 
7194                         for (int n1 = 0; n1 <= remainingFor1; n1++)
7195                         {
7196                             int remainingFor2 = 16 - n1 - n0;
7197                             if (remainingFor2 == 16)
7198                                 remainingFor2 = 15;
7199 
7200                             for (int n2 = 0; n2 <= remainingFor2; n2++)
7201                             {
7202                                 int n3 = 16 - n2 - n1 - n0;
7203 
7204                                 if (n3 == 16)
7205                                     continue;
7206 
7207                                 int counts[4] = { n0, n1, n2, n3 };
7208 
7209                                 TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
7210                             }
7211                         }
7212                     }
7213 
7214                     TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
7215 
7216                     if (alphaTest)
7217                     {
7218                         for (int n0 = 0; n0 <= 15; n0++)
7219                         {
7220                             int remainingFor1 = 16 - n0;
7221                             if (remainingFor1 == 16)
7222                                 remainingFor1 = 15;
7223 
7224                             for (int n1 = 0; n1 <= remainingFor1; n1++)
7225                             {
7226                                 int n2 = 16 - n1 - n0;
7227 
7228                                 if (n2 == 16)
7229                                     continue;
7230 
7231                                 int counts[3] = { n0, n1, n2 };
7232 
7233                                 TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
7234                             }
7235                         }
7236 
7237                         TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
7238                     }
7239                 }
7240                 else
7241                 {
7242                     int minRange = alphaTest ? 3 : 4;
7243 
7244                     for (int range = minRange; range <= 4; range++)
7245                     {
7246                         int tweakRounds = BCCommon::TweakRoundsForRange(range);
7247                         if (tweakRounds > maxTweakRounds)
7248                             tweakRounds = maxTweakRounds;
7249 
7250                         for (int tweak = 0; tweak < tweakRounds; tweak++)
7251                         {
7252                             MUInt15 endPoints[2][3];
7253 
7254                             ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
7255 
7256                             for (int refine = 0; refine < numRefineRounds; refine++)
7257                             {
7258                                 EndpointRefiner<3> refiner;
7259                                 refiner.Init(range, channelWeights);
7260 
7261                                 TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
7262 
7263                                 if (refine != numRefineRounds - 1)
7264                                     refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
7265                             }
7266                         }
7267                     }
7268                 }
7269 
7270                 for (int block = 0; block < ParallelMath::ParallelSize; block++)
7271                 {
7272                     ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
7273                     assert(range == 3 || range == 4);
7274 
7275                     ParallelMath::ScalarUInt16 compressedEP[2];
7276                     for (int ep = 0; ep < 2; ep++)
7277                     {
7278                         ParallelMath::ScalarUInt16 endPoint[3];
7279                         for (int ch = 0; ch < 3; ch++)
7280                             endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
7281 
7282                         int compressed = (endPoint[0] & 0xf8) << 8;
7283                         compressed |= (endPoint[1] & 0xfc) << 3;
7284                         compressed |= (endPoint[2] & 0xf8) >> 3;
7285 
7286                         compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
7287                     }
7288 
7289                     int indexOrder[4];
7290 
7291                     if (range == 4)
7292                     {
7293                         if (compressedEP[0] == compressedEP[1])
7294                         {
7295                             indexOrder[0] = 0;
7296                             indexOrder[1] = 0;
7297                             indexOrder[2] = 0;
7298                             indexOrder[3] = 0;
7299                         }
7300                         else if (compressedEP[0] < compressedEP[1])
7301                         {
7302                             std::swap(compressedEP[0], compressedEP[1]);
7303                             indexOrder[0] = 1;
7304                             indexOrder[1] = 3;
7305                             indexOrder[2] = 2;
7306                             indexOrder[3] = 0;
7307                         }
7308                         else
7309                         {
7310                             indexOrder[0] = 0;
7311                             indexOrder[1] = 2;
7312                             indexOrder[2] = 3;
7313                             indexOrder[3] = 1;
7314                         }
7315                     }
7316                     else
7317                     {
7318                         assert(range == 3);
7319 
7320                         if (compressedEP[0] > compressedEP[1])
7321                         {
7322                             std::swap(compressedEP[0], compressedEP[1]);
7323                             indexOrder[0] = 1;
7324                             indexOrder[1] = 2;
7325                             indexOrder[2] = 0;
7326                         }
7327                         else
7328                         {
7329                             indexOrder[0] = 0;
7330                             indexOrder[1] = 2;
7331                             indexOrder[2] = 1;
7332                         }
7333                         indexOrder[3] = 3;
7334                     }
7335 
7336                     packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
7337                     packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
7338                     packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
7339                     packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
7340 
7341                     for (int i = 0; i < 16; i += 4)
7342                     {
7343                         int packedIndexes = 0;
7344                         for (int subi = 0; subi < 4; subi++)
7345                         {
7346                             ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
7347                             packedIndexes |= (indexOrder[index] << (subi * 2));
7348                         }
7349 
7350                         packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
7351                     }
7352 
7353                     packedBlocks += packedBlockStride;
7354                 }
7355             }
7356         };
7357 
7358         // Signed input blocks are converted into unsigned space, with the maximum value being 254
BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize],const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])7359         void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
7360         {
7361             for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
7362             {
7363                 const PixelBlockS8& inputSignedBlock = inputSigned[block];
7364                 PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
7365 
7366                 for (size_t px = 0; px < 16; px++)
7367                 {
7368                     for (size_t ch = 0; ch < 4; ch++)
7369                         inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
7370                 }
7371             }
7372         }
7373 
FillWeights(const Options & options,float channelWeights[4])7374         void FillWeights(const Options &options, float channelWeights[4])
7375         {
7376             if (options.flags & Flags::Uniform)
7377                 channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
7378             else
7379             {
7380                 channelWeights[0] = options.redWeight;
7381                 channelWeights[1] = options.greenWeight;
7382                 channelWeights[2] = options.blueWeight;
7383                 channelWeights[3] = options.alphaWeight;
7384             }
7385         }
7386     }
7387 
7388     namespace Kernels
7389     {
EncodeBC7(uint8_t * pBC,const PixelBlockU8 * pBlocks,const cvtt::Options & options)7390         void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
7391         {
7392             assert(pBlocks);
7393             assert(pBC);
7394 
7395             float channelWeights[4];
7396             Internal::FillWeights(options, channelWeights);
7397 
7398             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7399             {
7400                 Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7);
7401                 pBC += ParallelMath::ParallelSize * 16;
7402             }
7403         }
7404 
EncodeBC6HU(uint8_t * pBC,const PixelBlockF16 * pBlocks,const cvtt::Options & options)7405         void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
7406         {
7407             assert(pBlocks);
7408             assert(pBC);
7409 
7410             float channelWeights[4];
7411             Internal::FillWeights(options, channelWeights);
7412 
7413             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7414             {
7415                 Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);
7416                 pBC += ParallelMath::ParallelSize * 16;
7417             }
7418         }
7419 
EncodeBC6HS(uint8_t * pBC,const PixelBlockF16 * pBlocks,const cvtt::Options & options)7420         void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
7421         {
7422             assert(pBlocks);
7423             assert(pBC);
7424 
7425             float channelWeights[4];
7426             Internal::FillWeights(options, channelWeights);
7427 
7428             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7429             {
7430                 Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);
7431                 pBC += ParallelMath::ParallelSize * 16;
7432             }
7433         }
7434 
EncodeBC1(uint8_t * pBC,const PixelBlockU8 * pBlocks,const cvtt::Options & options)7435         void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
7436         {
7437             assert(pBlocks);
7438             assert(pBC);
7439 
7440             float channelWeights[4];
7441             Internal::FillWeights(options, channelWeights);
7442 
7443             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7444             {
7445                 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
7446                 pBC += ParallelMath::ParallelSize * 8;
7447             }
7448         }
7449 
EncodeBC2(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7450         void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
7451         {
7452             assert(pBlocks);
7453             assert(pBC);
7454 
7455             float channelWeights[4];
7456             Internal::FillWeights(options, channelWeights);
7457 
7458             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7459             {
7460                 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
7461                 Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);
7462                 pBC += ParallelMath::ParallelSize * 16;
7463             }
7464         }
7465 
EncodeBC3(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7466         void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
7467         {
7468             assert(pBlocks);
7469             assert(pBC);
7470 
7471             float channelWeights[4];
7472             Internal::FillWeights(options, channelWeights);
7473 
7474             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7475             {
7476                 Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
7477                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
7478                 pBC += ParallelMath::ParallelSize * 16;
7479             }
7480         }
7481 
EncodeBC4U(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7482         void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
7483         {
7484             assert(pBlocks);
7485             assert(pBC);
7486 
7487             float channelWeights[4];
7488             Internal::FillWeights(options, channelWeights);
7489 
7490             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7491             {
7492                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
7493                 pBC += ParallelMath::ParallelSize * 8;
7494             }
7495         }
7496 
EncodeBC4S(uint8_t * pBC,const PixelBlockS8 * pBlocks,const Options & options)7497         void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
7498         {
7499             assert(pBlocks);
7500             assert(pBC);
7501 
7502             float channelWeights[4];
7503             Internal::FillWeights(options, channelWeights);
7504 
7505             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7506             {
7507                 PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
7508                 Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
7509 
7510                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
7511                 pBC += ParallelMath::ParallelSize * 8;
7512             }
7513         }
7514 
EncodeBC5U(uint8_t * pBC,const PixelBlockU8 * pBlocks,const Options & options)7515         void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
7516         {
7517             assert(pBlocks);
7518             assert(pBC);
7519 
7520             float channelWeights[4];
7521             Internal::FillWeights(options, channelWeights);
7522 
7523             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7524             {
7525                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
7526                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
7527                 pBC += ParallelMath::ParallelSize * 16;
7528             }
7529         }
7530 
EncodeBC5S(uint8_t * pBC,const PixelBlockS8 * pBlocks,const Options & options)7531         void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
7532         {
7533             assert(pBlocks);
7534             assert(pBC);
7535 
7536             float channelWeights[4];
7537             Internal::FillWeights(options, channelWeights);
7538 
7539             for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
7540             {
7541                 PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
7542                 Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
7543 
7544                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
7545                 Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
7546                 pBC += ParallelMath::ParallelSize * 16;
7547             }
7548         }
7549 
DecodeBC7(PixelBlockU8 * pBlocks,const uint8_t * pBC)7550         void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
7551         {
7552             assert(pBlocks);
7553             assert(pBC);
7554 
7555             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
7556             {
7557                 Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
7558                 pBC += 16;
7559             }
7560         }
7561 
DecodeBC6HU(PixelBlockF16 * pBlocks,const uint8_t * pBC)7562         void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
7563         {
7564             assert(pBlocks);
7565             assert(pBC);
7566 
7567             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
7568             {
7569                 Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);
7570                 pBC += 16;
7571             }
7572         }
7573 
DecodeBC6HS(PixelBlockF16 * pBlocks,const uint8_t * pBC)7574         void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
7575         {
7576             assert(pBlocks);
7577             assert(pBC);
7578 
7579             for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
7580             {
7581                 Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);
7582                 pBC += 16;
7583             }
7584         }
7585     }
7586 }
7587