1 // stb_dxt.h - Real-Time DXT1/DXT5 compressor
2 // Based on original by fabian "ryg" giesen v1.04
3 // Custom version, modified by Yann Collet
4 //
5 /*
6    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7 
8    Redistribution and use in source and binary forms, with or without
9    modification, are permitted provided that the following conditions are
10    met:
11 
12        * Redistributions of source code must retain the above copyright
13    notice, this list of conditions and the following disclaimer.
14        * Redistributions in binary form must reproduce the above
15    copyright notice, this list of conditions and the following disclaimer
16    in the documentation and/or other materials provided with the
17    distribution.
18 
19    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31    You can contact the author at :
32    - RygsDXTc source repository : http://code.google.com/p/rygsdxtc/
33 
34 */
35 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
36 //
37 // USAGE:
38 //   call stb_compress_dxt_block() for every block (you must pad)
39 //     source should be a 4x4 block of RGBA data in row-major order;
40 //     A is ignored if you specify alpha=0; you can turn on dithering
41 //     and "high quality" using mode.
42 //
43 // version history:
44 //   v1.06  - (cyan) implement Fabian Giesen's comments
45 //   v1.05  - (cyan) speed optimizations
46 //   v1.04  - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
47 //            single color match fix (allow for inexact color interpolation);
48 //            optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
49 //   v1.03  - (stb) endianness support
50 //   v1.02  - (stb) fix alpha encoding bug
51 //   v1.01  - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
52 //   v1.00  - (stb) first release
53 
54 #ifndef STB_INCLUDE_STB_DXT_H
55 #define STB_INCLUDE_STB_DXT_H
56 
57 
//*******************************************************************
// Enable custom Optimisations
// Comment this define if you want to revert to ryg's original code
// (the define selects the NEW_OPTIMISATIONS code paths in the implementation below)
#define NEW_OPTIMISATIONS
//*******************************************************************

// compression mode (bitflags), passed as the "mode" argument of stb_compress_dxt_block()
#define STB_DXT_NORMAL    0
#define STB_DXT_DITHER    1   // use dithering. dubious win. never use for normal maps and the like!
#define STB_DXT_HIGHQUAL  2   // high quality mode, does two refinement steps instead of 1. ~30-40% slower.

// Compresses a w x h RGBA image (4 bytes per pixel, rows of w*4 bytes) block by block.
// Each 4x4 block produces 16 bytes when isDxt5 is non-zero and 8 bytes otherwise;
// blocks are written to dst in row-major block order.
// The original signature has been modified by adding the parameter compressed_size,
// which returns the size in bytes of the compressed data written to dst.
void rygCompress(unsigned char *dst, unsigned char *src, int w, int h, int isDxt5, int& compressed_size);

// TODO remove these, not working properly..
// rygCompressYCoCg: converts every 4x4 block to a Co/Cg/scale/Y layout and
// compresses it with an alpha block (16 bytes of output per block).
void rygCompressYCoCg( unsigned char *dst, unsigned char *src, int w, int h );
// linearize: converts n*4 bytes from src to dst, one byte at a time.
void linearize( unsigned char * dst, const unsigned char * src, int n );

// Compresses one 4x4 block of RGBA pixels (64 bytes at src).
// When alpha is non-zero an 8-byte alpha block is written first, followed by the
// 8-byte color block; otherwise only the 8-byte color block is written.
// mode is a combination of the STB_DXT_* flags above.
void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode);
#define STB_COMPRESS_DXT_BLOCK
79 
80 #ifdef STB_DXT_IMPLEMENTATION
81 
82 // configuration options for DXT encoder. set them in the project/makefile or just define
83 // them at the top.
84 
85 // STB_DXT_USE_ROUNDING_BIAS
86 //     use a rounding bias during color interpolation. this is closer to what "ideal"
87 //     interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
88 //     implicitly had this turned on.
89 //
90 //     in case you're targeting a specific type of hardware (e.g. console programmers):
91 //     NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
92 //     to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
93 //     you also see "(a*5 + b*3) / 8" on some old GPU designs.
94 // #define STB_DXT_USE_ROUNDING_BIAS
95 
96 #include <stdlib.h>
97 #include <math.h>
98 #include <stddef.h>
99 #include <string.h> // memset
100 #include <assert.h>
101 #include <iostream>
102 #include <algorithm>
103 
104 
// Lookup tables; all of them are filled in by stb__InitDXT().

// 5-bit / 6-bit channel value -> 8-bit value (top bits replicated into the low bits)
static unsigned char stb__Expand5[32];
static unsigned char stb__Expand6[64];
// per 8-bit target value: [0] = max endpoint index, [1] = min endpoint index
// (5-bit and 6-bit respectively), as selected by stb__PrepareOptTable()
static unsigned char stb__OMatch5[256][2];
static unsigned char stb__OMatch6[256][2];
// quantize-and-re-expand tables: entry [v+8] belongs to value v; values outside
// 0..255 are clamped, which gives 8 entries of slack on either side
static unsigned char stb__QuantRBTab[256+16];
static unsigned char stb__QuantGTab[256+16];
111 
// Approximates (a*b)/255 with rounding, using shifts instead of a division.
static int stb__Mul8Bit(int a, int b)
{
   const int biased = a*b + 128;   // product plus rounding term
   const int carry  = biased >> 8;

   return (biased + carry) >> 8;
}
117 
stb__From16Bit(unsigned char * out,unsigned short v)118 static void stb__From16Bit(unsigned char *out, unsigned short v)
119 {
120    int rv = (v & 0xf800) >> 11;
121    int gv = (v & 0x07e0) >>  5;
122    int bv = (v & 0x001f) >>  0;
123 
124    out[0] = stb__Expand5[rv];
125    out[1] = stb__Expand6[gv];
126    out[2] = stb__Expand5[bv];
127    out[3] = 0;
128 }
129 
// Packs 8-bit r, g, b into a 16-bit 5:6:5 value (red in the top bits).
static unsigned short stb__As16Bit(int r, int g, int b)
{
   const int r5 = stb__Mul8Bit(r, 31);
   const int g6 = stb__Mul8Bit(g, 63);
   const int b5 = stb__Mul8Bit(b, 31);

   return (r5 << 11) + (g6 << 5) + b5;
}
134 
// Returns the value one third of the way from a to b (a weighted twice, b once),
// using the rounding type selected by STB_DXT_USE_ROUNDING_BIAS.
static int stb__Lerp13(int a, int b)
{
#ifndef STB_DXT_USE_ROUNDING_BIAS
   // without rounding bias: plain truncating division
   // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
   const int weighted = 2*a + b;
   return weighted / 3;
#else
   // with rounding bias: step away from a by about a third of the difference
   const int delta = b - a;
   return a + stb__Mul8Bit(delta, 0x55);
#endif
}
147 
// lerp RGB color: interpolates the first three channels of p1 and p2 with
// stb__Lerp13 and stores them in out. out[3] is not written.
static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
{
   int ch;

   for (ch = 0; ch < 3; ++ch)
      out[ch] = stb__Lerp13(p1[ch], p2[ch]);
}
155 
156 /****************************************************************************/
157 
// compute table to reproduce constant colors as accurately as possible
//
// For every 8-bit target value i, stores in Table[2*i] and Table[2*i+1] the
// pair of endpoint indices (max, min) out of "size" candidates whose 1/3
// interpolation of the expanded values comes closest to i.
static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size)
{
   int target;

   for (target = 0; target < 256; ++target) {
      unsigned char *entry = Table + target*2;
      int lowestErr = 256;
      int lo, hi;

      for (lo = 0; lo < size; ++lo) {
         const int loVal = expand[lo];

         for (hi = 0; hi < size; ++hi) {
            const int hiVal = expand[hi];
            int err = abs(stb__Lerp13(hiVal, loVal) - target);

            // DX10 spec says that interpolation must be within 3% of "correct" result,
            // add this as error term. (normally we'd expect a random distribution of
            // +-1.5% error, but nowhere in the spec does it say that the error has to be
            // unbiased - better safe than sorry).
            err += abs(hiVal - loVal) * 3 / 100;

            if (err >= lowestErr)
               continue;   // not strictly better than what we already have

            entry[0] = hi;
            entry[1] = lo;
            lowestErr = err;
         }
      }
   }
}
186 
// Builds the 4-entry palette (4 bytes per entry) for the endpoints c0 and c1:
// entries 0 and 1 are the expanded endpoints, entry 2 lies one third of the
// way from c0 to c1 and entry 3 one third of the way from c1 to c0.
static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
{
   unsigned char *end0 = color;
   unsigned char *end1 = color + 4;

   stb__From16Bit(end0, c0);
   stb__From16Bit(end1, c1);
   stb__Lerp13RGB(color +  8, end0, end1);
   stb__Lerp13RGB(color + 12, end1, end0);
}
194 
195 // Block dithering function. Simply dithers a block to 565 RGB.
196 // (Floyd-Steinberg)
stb__DitherBlock(unsigned char * dest,unsigned char * block)197 static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
198 {
199   int err[8],*ep1 = err,*ep2 = err+4, *et;
200   int ch,y;
201 
202   // process channels seperately
203   for (ch=0; ch<3; ++ch) {
204       unsigned char *bp = block+ch, *dp = dest+ch;
205       unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
206       memset(err, 0, sizeof(err));
207       for(y=0; y<4; ++y) {
208          dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
209          ep1[0] = bp[ 0] - dp[ 0];
210          dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
211          ep1[1] = bp[ 4] - dp[ 4];
212          dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
213          ep1[2] = bp[ 8] - dp[ 8];
214          dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
215          ep1[3] = bp[12] - dp[12];
216          bp += 16;
217          dp += 16;
218          et = ep1, ep1 = ep2, ep2 = et; // swap
219       }
220    }
221 }
222 
// The color matching function
//
// Assigns each of the 16 RGBA pixels in block to one of the 4 palette entries
// in color (4 bytes per entry, as laid out by stb__EvalColors) and returns the
// 2-bit indices packed into 32 bits, pixel 0 in the lowest bits.
// When dither is non-zero, the projection error is diffused to neighbouring
// pixels (Floyd-Steinberg) before each index is chosen.
static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither)
{
   unsigned int mask = 0;
   int dirr = color[0*4+0] - color[1*4+0];
   int dirg = color[0*4+1] - color[1*4+1];
   int dirb = color[0*4+2] - color[1*4+2];
   int dots[16];
   int stops[4];
   int i;
   int c0Point, halfPoint, c3Point;

   // project every pixel and every palette entry onto the endpoint axis
   for(i=0;i<16;i++)
      dots[i] = block[i*4+0]*dirr + block[i*4+1]*dirg + block[i*4+2]*dirb;

   for(i=0;i<4;i++)
      stops[i] = color[i*4+0]*dirr + color[i*4+1]*dirg + color[i*4+2]*dirb;

   // think of the colors as arranged on a line; project point onto that line, then choose
   // next color out of available ones. we compute the crossover points for "best color in top
   // half"/"best in bottom half" and then the same inside that subinterval.
   //
   // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
   // but it's very close and a lot faster.
   // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

   c0Point   = (stops[1] + stops[3]) >> 1;
   halfPoint = (stops[3] + stops[2]) >> 1;
   c3Point   = (stops[2] + stops[0]) >> 1;

   if(!dither)
   {
      // the version without dithering is straightforward

#ifdef NEW_OPTIMISATIONS
      // The three comparisons form a 3-bit number that selects the palette
      // index, already positioned in the top two bits of the mask. The table
      // is unsigned so that "2 << 30" and "3 << 30" do not overflow an int.
      static const unsigned int indexMap[8] = { 0u << 30,2u << 30,0u << 30,2u << 30,3u << 30,3u << 30,1u << 30,1u << 30 };

      for(i=0;i<16;i++)
      {
        int dot = dots[i];
        int bits =( (dot < halfPoint) ? 4 : 0 )
                | ( (dot < c0Point) ? 2 : 0 )
                | ( (dot < c3Point) ? 1 : 0 );

        // pixels enter at the top and are shifted down, so pixel 0 ends up lowest
        mask >>= 2;
        mask |= indexMap[bits];
      }

#else
      for (i=15;i>=0;i--) {
         int dot = dots[i];
         mask <<= 2;

         if(dot < halfPoint)
           mask |= (dot < c0Point) ? 1 : 3;
         else
           mask |= (dot < c3Point) ? 2 : 0;
      }
#endif

  } else {
      // with floyd-steinberg dithering
      int err[8],*ep1 = err,*ep2 = err+4;
      int *dp = dots, y;

      // Work in 1/16 units so the error weights (7,3,5,1) need no division.
      // Multiplication is used instead of "<< 4" because these values may be
      // negative, and left-shifting a negative int is undefined.
      c0Point   *= 16;
      halfPoint *= 16;
      c3Point   *= 16;
      for(i=0;i<8;i++)
         err[i] = 0;

      for(y=0;y<4;y++)
      {
         int dot,lmask,step;

         dot = dp[0]*16 + (3*ep2[1] + 5*ep2[0]);
         if(dot < halfPoint)
           step = (dot < c0Point) ? 1 : 3;
         else
           step = (dot < c3Point) ? 2 : 0;
         ep1[0] = dp[0] - stops[step];
         lmask = step;

         dot = dp[1]*16 + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]);
         if(dot < halfPoint)
           step = (dot < c0Point) ? 1 : 3;
         else
           step = (dot < c3Point) ? 2 : 0;
         ep1[1] = dp[1] - stops[step];
         lmask |= step<<2;

         dot = dp[2]*16 + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]);
         if(dot < halfPoint)
           step = (dot < c0Point) ? 1 : 3;
         else
           step = (dot < c3Point) ? 2 : 0;
         ep1[2] = dp[2] - stops[step];
         lmask |= step<<4;

         dot = dp[3]*16 + (7*ep1[2] + 5*ep2[3] + ep2[2]);
         if(dot < halfPoint)
           step = (dot < c0Point) ? 1 : 3;
         else
           step = (dot < c3Point) ? 2 : 0;
         ep1[3] = dp[3] - stops[step];
         lmask |= step<<6;

         dp += 4;
         // shift as unsigned: for the last row the indices land in bits 24..31
         mask |= (unsigned int) lmask << (y*8);
         { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
      }
   }

   return mask;
}
339 
// The color optimization function. (Clever code, part 1)
//
// Picks initial endpoint colors for a 4x4 RGBA block: estimates the principal
// axis of the RGB distribution (covariance matrix followed by a few power
// iterations), then takes the pixels with the largest and smallest projection
// onto that axis. The two colors are returned in 5:6:5 form through pmax16
// and pmin16.
static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
{
  static const int nIterPower = 4;
  unsigned char *minp, *maxp;
  double magn;
  int v_r,v_g,v_b;
  float covf[6],vfr,vfg,vfb;

  // determine color distribution
  int cov[6];
  int mu[3],min[3],max[3];
  int ch,i,iter;
  int mind, maxd;

  // per-channel rounded mean, minimum and maximum over the 16 pixels
  for(ch=0;ch<3;ch++)
  {
    const unsigned char *bp = block + ch;
    int muv,minv,maxv;

    muv = minv = maxv = bp[0];
    for(i=4;i<64;i+=4)
    {
      muv += bp[i];
      if (bp[i] < minv) minv = bp[i];
      else if (bp[i] > maxv) maxv = bp[i];
    }

    mu[ch] = (muv + 8) >> 4;
    min[ch] = minv;
    max[ch] = maxv;
  }

  // determine covariance matrix (upper triangle: rr, rg, rb, gg, gb, bb)
  for (i=0;i<6;i++)
     cov[i] = 0;

  for (i=0;i<16;i++)
  {
    int r = block[i*4+0] - mu[0];
    int g = block[i*4+1] - mu[1];
    int b = block[i*4+2] - mu[2];

    cov[0] += r*r;
    cov[1] += r*g;
    cov[2] += r*b;
    cov[3] += g*g;
    cov[4] += g*b;
    cov[5] += b*b;
  }

  // convert covariance matrix to float, find principal axis via power iter
  for(i=0;i<6;i++)
    covf[i] = cov[i] / 255.0f;

  // start the iteration from the per-channel value range
  vfr = (float) (max[0] - min[0]);
  vfg = (float) (max[1] - min[1]);
  vfb = (float) (max[2] - min[2]);

  for(iter=0;iter<nIterPower;iter++)
  {
    float r = vfr*covf[0] + vfg*covf[1] + vfb*covf[2];
    float g = vfr*covf[1] + vfg*covf[3] + vfb*covf[4];
    float b = vfr*covf[2] + vfg*covf[4] + vfb*covf[5];

    vfr = r;
    vfg = g;
    vfb = b;
  }

  // largest absolute component of the axis estimate
  magn = fabs(vfr);
  if (fabs(vfg) > magn) magn = fabs(vfg);
  if (fabs(vfb) > magn) magn = fabs(vfb);

   if(magn < 4.0f)
   { // too small, default to luminance
      v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
      v_g = 587;
      v_b = 114;
   } else {
      // rescale so that the largest component becomes 512
      magn = 512.0 / magn;
      v_r = (int) (vfr * magn);
      v_g = (int) (vfg * magn);
      v_b = (int) (vfb * magn);
   }

   // Pick colors at extreme points. Pixel 0 seeds both extremes; a pixel that
   // sets a new minimum cannot also exceed the maximum, hence the "else".
   mind = maxd = block[0]*v_r + block[1]*v_g + block[2]*v_b;
   minp = maxp = block;
   for(i=1;i<16;i++)
   {
      int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;

      if (dot < mind) {
         mind = dot;
         minp = block+i*4;
      } else if (dot > maxd) {
         maxd = dot;
         maxp = block+i*4;
      }
   }

   *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]);
   *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
}
497 
// Converts y to int (truncating towards zero) and limits the result to p0..p1.
inline static int stb__sclamp(float y, int p0, int p1)
{
   const int truncated = (int) y;

#ifdef NEW_OPTIMISATIONS
   // cap from above first, then from below
   const int capped = (truncated > p1) ? p1 : truncated;
   return (capped < p0) ? p0 : capped;
#else
   if (truncated < p0)
      return p0;
   return (truncated > p1) ? p1 : truncated;
#endif
}
511 
512 // The refinement function. (Clever code, part 2)
513 // Tries to optimize colors to suit block contents better.
514 // (By solving a least squares system via normal equations+Cramer's rule)
stb__RefineBlock(unsigned char * block,unsigned short * pmax16,unsigned short * pmin16,unsigned int mask)515 static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
516 {
517    static const int w1Tab[4] = { 3,0,2,1 };
518    static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
519    // ^some magic to save a lot of multiplies in the accumulating loop...
520    // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
521 
522    float frb,fg;
523    unsigned short oldMin, oldMax, min16, max16;
524    int i, akku = 0, xx,xy,yy;
525    int At1_r,At1_g,At1_b;
526    int At2_r,At2_g,At2_b;
527    unsigned int cm = mask;
528 
529    oldMin = *pmin16;
530    oldMax = *pmax16;
531 
532    if((mask ^ (mask<<2)) < 4) // all pixels have the same index?
533    {
534       // yes, linear system would be singular; solve using optimal
535       // single-color match on average color
536       int r = 8, g = 8, b = 8;
537       for (i=0;i<16;++i) {
538          r += block[i*4+0];
539          g += block[i*4+1];
540          b += block[i*4+2];
541       }
542 
543       r >>= 4; g >>= 4; b >>= 4;
544 
545       max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
546       min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
547    } else {
548       At1_r = At1_g = At1_b = 0;
549       At2_r = At2_g = At2_b = 0;
550       for (i=0;i<16;++i,cm>>=2)
551 	  {
552          int step = cm&3;
553          int w1 = w1Tab[step];
554          int r = block[i*4+0];
555          int g = block[i*4+1];
556          int b = block[i*4+2];
557 
558          akku    += prods[step];
559          At1_r   += w1*r;
560          At1_g   += w1*g;
561          At1_b   += w1*b;
562          At2_r   += r;
563          At2_g   += g;
564          At2_b   += b;
565       }
566 
567       At2_r = 3*At2_r - At1_r;
568       At2_g = 3*At2_g - At1_g;
569       At2_b = 3*At2_b - At1_b;
570 
571       // extract solutions and decide solvability
572       xx = akku >> 16;
573       yy = (akku >> 8) & 0xff;
574       xy = (akku >> 0) & 0xff;
575 
576       frb = 3.0f * 31.0f / 255.0f / (xx*yy - xy*xy);
577       fg = frb * 63.0f / 31.0f;
578 
579       // solve.
580       max16 =   stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11;
581       max16 |=  stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5;
582       max16 |=  stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0;
583 
584       min16 =   stb__sclamp((At2_r*xx - At1_r*xy)*frb+0.5f,0,31) << 11;
585       min16 |=  stb__sclamp((At2_g*xx - At1_g*xy)*fg +0.5f,0,63) << 5;
586       min16 |=  stb__sclamp((At2_b*xx - At1_b*xy)*frb+0.5f,0,31) << 0;
587    }
588 
589    *pmin16 = min16;
590    *pmax16 = max16;
591    return oldMin != min16 || oldMax != max16;
592 }
593 
594 // Color block compression
stb__CompressColorBlock(unsigned char * dest,unsigned char * block,int mode)595 static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
596 {
597    unsigned int mask;
598    int i;
599    int dither;
600    int refinecount;
601    unsigned short max16, min16;
602    unsigned char dblock[16*4],color[4*4];
603 
604    dither = mode & STB_DXT_DITHER;
605    refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
606 
607    // check if block is constant
608    for (i=1;i<16;i++)
609       if (((unsigned int *) block)[i] != ((unsigned int *) block)[0])
610          break;
611 
612    if(i == 16)
613    { // constant color
614       int r = block[0], g = block[1], b = block[2];
615       mask  = 0xaaaaaaaa;
616       max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
617       min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
618    } else
619    {
620       // first step: compute dithered version for PCA if desired
621       if(dither)
622          stb__DitherBlock(dblock,block);
623 
624       // second step: pca+map along principal axis
625       stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16);
626       if (max16 != min16)
627 	  {
628          stb__EvalColors(color,max16,min16);
629          mask = stb__MatchColorsBlock(block,color,dither);
630       } else
631          mask = 0;
632 
633       // third step: refine (multiple times if requested)
634       for (i=0;i<refinecount;i++) {
635          unsigned int lastmask = mask;
636 
637          if (stb__RefineBlock(dither ? dblock : block,&max16,&min16,mask))
638 		 {
639             if (max16 != min16)
640 			{
641                stb__EvalColors(color,max16,min16);
642                mask = stb__MatchColorsBlock(block,color,dither);
643             } else
644 			{
645                mask = 0;
646                break;
647             }
648          }
649 
650          if(mask == lastmask)
651             break;
652       }
653   }
654 
655   // write the color block
656   if(max16 < min16)
657   {
658      unsigned short t = min16;
659      min16 = max16;
660      max16 = t;
661      mask ^= 0x55555555;
662   }
663 
664   dest[0] = (unsigned char) (max16);
665   dest[1] = (unsigned char) (max16 >> 8);
666   dest[2] = (unsigned char) (min16);
667   dest[3] = (unsigned char) (min16 >> 8);
668   dest[4] = (unsigned char) (mask);
669   dest[5] = (unsigned char) (mask >> 8);
670   dest[6] = (unsigned char) (mask >> 16);
671   dest[7] = (unsigned char) (mask >> 24);
672 }
673 
// Alpha block compression (this is easy for a change)
//
// Encodes the alpha channel (byte 3 of each of the 16 RGBA pixels at src)
// into the 8 bytes at dest: maximum alpha, minimum alpha, then sixteen 3-bit
// indices packed low bits first into 6 bytes. mode is not used.
static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src,int mode)
{
   int i,dist,bias,dist4,dist2,bits,mask;

   // find min/max color
   int mn,mx;

   (void) mode;

   mn = mx = src[3];
   for (i=1;i<16;i++)
   {
      if (src[i*4+3] < mn) mn = src[i*4+3];
      else if (src[i*4+3] > mx) mx = src[i*4+3];
   }

   // encode them
   dest[0] = mx;
   dest[1] = mn;
   dest += 2;

#ifdef NEW_OPTIMISATIONS
   // mono-alpha shortcut: all six index bytes are zero. Written bytewise so
   // that no alignment of dest is assumed.
   if (mn==mx)
   {
      memset(dest, 0, 6);
      return;
   }
#endif

   // determine bias and emit color indices
   // given the choice of mx/mn, these indices are optimal:
   // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
   dist = mx-mn;
   dist4 = dist*4;
   dist2 = dist*2;
   bias = (dist < 8) ? (dist - 1) : (dist/2 + 2);
   bias -= mn * 7;
   bits = 0;
   mask = 0;

   for (i=0;i<16;i++)
   {
      int a = src[i*4+3]*7 + bias;
      int ind,t;

      // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
      t = (a >= dist4) ? -1 : 0; ind =  t & 4; a -= dist4 & t;
      t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t;
      ind += (a >= dist);

      // turn linear scale into DXT index (0/1 are extremal pts)
      ind = -ind & 7;
      ind ^= (2 > ind);

      // write index: collect 3 bits at a time, flush every complete byte
      mask |= ind << bits;
      if((bits += 3) >= 8)
      {
         *dest++ = mask;
         mask >>= 8;
         bits -= 8;
      }
   }
}
740 
741 
stb__InitDXT()742 static void stb__InitDXT()
743 {
744    int i;
745    for(i=0;i<32;i++)
746       stb__Expand5[i] = (i<<3)|(i>>2);
747 
748    for(i=0;i<64;i++)
749       stb__Expand6[i] = (i<<2)|(i>>4);
750 
751    for(i=0;i<256+16;i++)
752    {
753       int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8;
754       stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)];
755       stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)];
756    }
757 
758    stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32);
759    stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64);
760 }
761 
762 
// Compresses one 4x4 block of RGBA pixels (64 bytes at src) into dest.
// With alpha != 0 the output is an 8-byte alpha block followed by the 8-byte
// color block; otherwise it is the 8-byte color block only.
// The lookup tables are built on the first call.
void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
{
   static int needsInit = 1;
   unsigned char *pixels = (unsigned char*) src;
   unsigned char *colorDest = dest;

   if (needsInit)
   {
      stb__InitDXT();
      needsInit = 0;
   }

   if (alpha)
   {
      stb__CompressAlphaBlock(dest, pixels, mode);
      colorDest = dest + 8;   // color block follows the alpha block
   }

   stb__CompressColorBlock(colorDest, pixels, mode);
}
780 
// Returns the smaller of x and y.
int imin(int x, int y)
{
   if (x < y)
      return x;
   return y;
}
782 
783 
784 
785 
786 
// Copies the 4x4 pixel block whose top-left corner is (x, y) out of a w x h
// RGBA image (4 bytes per pixel, rows of w*4 bytes) into block (64 bytes,
// row-major). Where the block extends past the right or bottom edge, the
// missing columns/rows are filled by repeating available ones according to
// the rem table. Requires x < w and y < h.
static void extractBlock(const unsigned char *src, int x, int y,
                          int w, int h, unsigned char *block)
{
   // rem[(n-1)*4 + k]: offset of the source column/row used for block
   // position k when only n columns/rows are available
   static const int rem[] =
   {
      0, 0, 0, 0,
      0, 1, 0, 1,
      0, 1, 2, 0,
      0, 1, 2, 3
   };
   const size_t stride = (size_t) w * 4;   // bytes per image row
   const int bw = (w - x < 4) ? (w - x) : 4;
   const int bh = (h - y < 4) ? (h - y) : 4;
   int i, j;

   if (bw == 4 && bh == 4)
   {
      // Full square shortcut: four contiguous 16-byte rows. memcpy is used
      // rather than word-sized loads so the buffers need no alignment.
      const unsigned char *row = src + (size_t) y * stride + (size_t) x * 4;
      for (i = 0; i < 4; ++i, row += stride, block += 16)
         memcpy(block, row, 16);
      return;
   }

   for (i = 0; i < 4; ++i)
   {
      const int by = rem[(bh - 1) * 4 + i] + y;
      const unsigned char *row = src + (size_t) by * stride;

      for (j = 0; j < 4; ++j)
      {
         const int bx = rem[(bw - 1) * 4 + j] + x;
         memcpy(block + (i * 4 * 4) + (j * 4), row + (size_t) bx * 4, 4);
      }
   }
}
839 
// Saturates n to the range 0..255 and returns it as a byte.
inline static unsigned char clamp255( int n )
{
  const int capped = (n > 255) ? 255 : n;
  return (capped < 0) ? 0 : capped;
}
847 
848 
// Converts a 4x4 block of RGBA pixels (64 bytes at src) into the channel
// layout that rygCompressYCoCg feeds to the DXT5 encoder. For every pixel the
// four output bytes are: [0] Co, [1] Cg, [2] scale byte, [3] Y.
// The source alpha byte is not read.
void rgbToYCoCgBlock( unsigned char * dst, const unsigned char * src )
{
    // NOTE(review): this byte used to be derived from a chroma-extent scan,
    // but that scan's loop was bounded by a counter initialised to 0, so it
    // never executed and the value was always 0. The chroma values below are
    // not scaled, so 0 is kept; confirm before re-enabling any scaling.
    const unsigned char bVal = 0;
    int i;

    for(i=0;i<16;i++)
    {
        const unsigned char *px = src + i*4;
        unsigned char *outPx = dst + i*4;

        // read all inputs before writing, in case dst and src overlap
        const int r = px[0];
        const int g = (px[1] + 1) >> 1;          // half of green, rounded
        const int b = px[2];
        const int tmp = (2 + r + b) >> 2;        // quarter of red+blue, rounded

        outPx[0] = clamp255( 128 + ((r - b + 1) >> 1) );   // Co
        outPx[1] = clamp255( 128 + g - tmp );              // Cg
        outPx[2] = bVal;
        outPx[3] = clamp255( g + tmp );                    // Y
    }
}
940 
941 
rygCompress(unsigned char * dst,unsigned char * src,int w,int h,int isDxt5,int & compressed_size)942 void rygCompress(unsigned char *dst, unsigned char *src, int w, int h, int isDxt5, int& compressed_size)
943 {
944 
945    unsigned char block[64];
946    int x, y;
947 
948    unsigned char* initial_dst = dst;
949 
950    for (y = 0; y < h; y += 4)
951    {
952       for(x = 0; x < w; x += 4)
953       {
954          extractBlock(src, x, y, w, h, block);
955          stb_compress_dxt_block(dst, block, isDxt5, STB_DXT_NORMAL);
956          dst += isDxt5 ? 16 : 8;
957       }
958    }
959 
960    compressed_size = dst - initial_dst;
961 }
962 
rygCompressYCoCg(unsigned char * dst,unsigned char * src,int w,int h)963 void rygCompressYCoCg( unsigned char *dst, unsigned char *src, int w, int h )
964 {
965     unsigned char block[64];
966    unsigned char ycocgblock[64];
967    int x, y;
968 
969    for(y = 0; y < h; y += 4)
970    {
971       for(x = 0; x < w; x += 4)
972       {
973          extractBlock(src, x, y, w, h, block);
974          rgbToYCoCgBlock(ycocgblock,block);
975          stb_compress_dxt_block(dst, ycocgblock, 1, 10);
976          dst += 16;
977       }
978    }
979 
980 }
981 
stbgl__compress(unsigned char * p,unsigned char * rgba,int w,int h,int isDxt5)982 static void stbgl__compress(unsigned char *p, unsigned char *rgba, int w, int h, int isDxt5)
983 {
984    int i,j,y,y2;
985    int alpha = isDxt5;
986 
987    for (j=0; j < w; j += 4) {
988       int x=4;
989       for (i=0; i < h; i += 4) {
990          unsigned char block[16*4];
991          if (i+3 >= w) x = w-i;
992          for (y=0; y < 4; ++y) {
993             if (j+y >= h) break;
994             memcpy(block+y*16, rgba + w*4*(j+y) + i*4, x*4);
995          }
996          if (x < 4) {
997             switch (x) {
998                case 0: assert(0);
999                case 1:
1000                   for (y2=0; y2 < y; ++y2) {
1001                      memcpy(block+y2*16+1*4, block+y2*16+0*4, 4);
1002                      memcpy(block+y2*16+2*4, block+y2*16+0*4, 8);
1003                   }
1004                   break;
1005                case 2:
1006                   for (y2=0; y2 < y; ++y2)
1007                      memcpy(block+y2*16+2*4, block+y2*16+0*4, 8);
1008                   break;
1009                case 3:
1010                   for (y2=0; y2 < y; ++y2)
1011                      memcpy(block+y2*16+3*4, block+y2*16+1*4, 4);
1012                   break;
1013             }
1014          }
1015          y2 = 0;
1016          for(; y<4; ++y,++y2)
1017             memcpy(block+y*16, block+y2*16, 4*4);
1018          stb_compress_dxt_block(p, block, alpha, 10);
1019          p += alpha ? 16 : 8;
1020       }
1021    }
1022   // assert(p <= end);
1023 }
1024 
linearize(unsigned char inByte)1025 static inline unsigned char linearize(unsigned char inByte)
1026 {
1027     float srgbVal = ((float)inByte) / 255.0f;
1028     float linearVal;
1029 
1030     if(srgbVal < 0.04045)
1031         linearVal = srgbVal / 12.92f;
1032     else
1033         linearVal = pow( (srgbVal + 0.055f) / 1.055f, 2.4f);
1034 
1035     return (unsigned char)(floor(sqrt(linearVal)* 255.0 + 0.5));
1036 }
1037 
linearize(unsigned char * dst,const unsigned char * src,int n)1038 void linearize( unsigned char * dst, const unsigned char * src, int n )
1039 {
1040   n*=4;
1041   for( int i = 0; i < n; i++ )
1042     dst[i] = linearize(src[i]);
1043 }
1044 
1045 
1046 
1047 #endif // STB_DXT_IMPLEMENTATION
1048 
1049 #endif // STB_INCLUDE_STB_DXT_H
1050