1 // stb_dxt.h - Real-Time DXT1/DXT5 compressor
2 // Based on original by fabian "ryg" giesen v1.04
3 // Custom version, modified by Yann Collet
4 //
5 /*
6 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are
10 met:
11
12 * Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above
15 copyright notice, this list of conditions and the following disclaimer
16 in the documentation and/or other materials provided with the
17 distribution.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 You can contact the author at :
32 - RygsDXTc source repository : http://code.google.com/p/rygsdxtc/
33
34 */
35 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
36 //
37 // USAGE:
38 // call stb_compress_dxt_block() for every block (you must pad)
39 // source should be a 4x4 block of RGBA data in row-major order;
40 // A is ignored if you specify alpha=0; you can turn on dithering
41 // and "high quality" using mode.
42 //
43 // version history:
44 // v1.06 - (cyan) implement Fabian Giesen's comments
45 // v1.05 - (cyan) speed optimizations
46 // v1.04 - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
47 // single color match fix (allow for inexact color interpolation);
48 // optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
49 // v1.03 - (stb) endianness support
50 // v1.02 - (stb) fix alpha encoding bug
51 // v1.01 - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
52 // v1.00 - (stb) first release
53
54 #ifndef STB_INCLUDE_STB_DXT_H
55 #define STB_INCLUDE_STB_DXT_H
56
57
58 //*******************************************************************
59 // Enable custom Optimisations
60 // Comment this define if you want to revert to ryg's original code
61 #define NEW_OPTIMISATIONS
62 //*******************************************************************
63
64 // compression mode (bitflags)
65 #define STB_DXT_NORMAL 0
66 #define STB_DXT_DITHER 1 // use dithering. dubious win. never use for normal maps and the like!
67 #define STB_DXT_HIGHQUAL 2 // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
68
// rygCompress() differs from the original signature: the extra output parameter
// compressed_size receives the size, in bytes, of the compressed data written to dst.
71 void rygCompress(unsigned char *dst, unsigned char *src, int w, int h, int isDxt5, int& compressed_size);
72
73 // TODO remove these, not working properly..
74 void rygCompressYCoCg( unsigned char *dst, unsigned char *src, int w, int h );
75 void linearize( unsigned char * dst, const unsigned char * src, int n );
76
77 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode);
78 #define STB_COMPRESS_DXT_BLOCK
79
80 #ifdef STB_DXT_IMPLEMENTATION
81
82 // configuration options for DXT encoder. set them in the project/makefile or just define
83 // them at the top.
84
85 // STB_DXT_USE_ROUNDING_BIAS
86 // use a rounding bias during color interpolation. this is closer to what "ideal"
87 // interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
88 // implicitly had this turned on.
89 //
90 // in case you're targeting a specific type of hardware (e.g. console programmers):
91 // NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
92 // to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
93 // you also see "(a*5 + b*3) / 8" on some old GPU designs.
94 // #define STB_DXT_USE_ROUNDING_BIAS
95
96 #include <stdlib.h>
97 #include <math.h>
98 #include <stddef.h>
99 #include <string.h> // memset
100 #include <assert.h>
101 #include <iostream>
102 #include <algorithm>
103
104
// Lookup tables. All of them are filled in by stb__InitDXT().

// 5-bit and 6-bit channel values expanded to 8 bits.
static unsigned char stb__Expand5[32];
static unsigned char stb__Expand6[64];

// For each 8-bit value: [0] = "max" endpoint index, [1] = "min" endpoint index
// of the best single-color match (see stb__PrepareOptTable).
static unsigned char stb__OMatch5[256][2];
static unsigned char stb__OMatch6[256][2];

// Quantization tables used for dithering. Entry i holds the quantized and
// re-expanded value of (i-8) clamped to 0..255, so callers index them at +8.
static unsigned char stb__QuantRBTab[256+16];
static unsigned char stb__QuantGTab[256+16];
111
// Divide-free approximation of (a*b)/255 with rounding: adds a bias of 128,
// then folds the high byte back in before the final shift by 8.
static int stb__Mul8Bit(int a, int b)
{
   int product = a * b;
   product += 128;
   product += product >> 8;
   return product >> 8;
}
117
stb__From16Bit(unsigned char * out,unsigned short v)118 static void stb__From16Bit(unsigned char *out, unsigned short v)
119 {
120 int rv = (v & 0xf800) >> 11;
121 int gv = (v & 0x07e0) >> 5;
122 int bv = (v & 0x001f) >> 0;
123
124 out[0] = stb__Expand5[rv];
125 out[1] = stb__Expand6[gv];
126 out[2] = stb__Expand5[bv];
127 out[3] = 0;
128 }
129
// Packs three 8-bit channel values into a 16-bit 5:6:5 color. Each channel is
// scaled to its target bit depth with stb__Mul8Bit before being packed.
static unsigned short stb__As16Bit(int r, int g, int b)
{
   const int r5 = stb__Mul8Bit(r, 31);
   const int g6 = stb__Mul8Bit(g, 63);
   const int b5 = stb__Mul8Bit(b, 31);

   return (unsigned short)((r5 << 11) + (g6 << 5) + b5);
}
134
// Interpolates one third of the way from a towards b, i.e. (2*a + b)/3.
// The rounding behaviour is selected by STB_DXT_USE_ROUNDING_BIAS.
static int stb__Lerp13(int a, int b)
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
   // biased variant: a + (b-a)*0x55/255, rounded by stb__Mul8Bit
   const int delta = b - a;
   return a + stb__Mul8Bit(delta, 0x55);
#else
   // unbiased variant: plain truncating integer division.
   // ("/ 3" may be replaced by "* 0xaaab) >> 17" if the division is too slow.)
   const int weighted = 2*a + b;
   return weighted / 3;
#endif
}
147
// Applies stb__Lerp13 to the first three bytes (R, G, B) of p1 and p2 and
// stores the results in out[0..2]. out[3] is left untouched.
static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
{
   int channel;
   for (channel = 0; channel < 3; ++channel)
      out[channel] = stb__Lerp13(p1[channel], p2[channel]);
}
155
156 /****************************************************************************/
157
// Builds the table used to reproduce constant colors as accurately as possible.
// For every 8-bit target value t, Table[2*t+0] / Table[2*t+1] receive the pair of
// quantized endpoint indices (max, min) for which stb__Lerp13(expand[max], expand[min])
// lands closest to t. `expand` maps a quantized index to its 8-bit value and
// `size` is the number of quantized levels.
static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size)
{
   int target;

   for (target = 0; target < 256; target++) {
      int lowestErr = 256;
      int lo, hi;

      for (lo = 0; lo < size; lo++) {
         const int loVal = expand[lo];

         for (hi = 0; hi < size; hi++) {
            const int hiVal = expand[hi];
            int err = abs(stb__Lerp13(hiVal, loVal) - target);

            // DX10 spec says that interpolation must be within 3% of the "correct"
            // result, so the endpoint distance is added as an error term. (A random
            // +-1.5% error would be expected, but the spec never says the error has
            // to be unbiased - better safe than sorry.)
            err += abs(hiVal - loVal) * 3 / 100;

            // strict comparison: on ties the first pair found is kept
            if (err >= lowestErr)
               continue;

            Table[target*2+0] = hi;
            Table[target*2+1] = lo;
            lowestErr = err;
         }
      }
   }
}
186
// Fills the 4-entry palette `color` (4 bytes per entry) for the endpoint pair
// c0/c1: entries 0 and 1 are the unpacked endpoints, entry 2 is one third of
// the way from c0 to c1, entry 3 is one third of the way from c1 to c0.
static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
{
   unsigned char *first   = color;
   unsigned char *second  = color + 4;
   unsigned char *nearC0  = color + 8;
   unsigned char *nearC1  = color + 12;

   stb__From16Bit(first,  c0);
   stb__From16Bit(second, c1);
   stb__Lerp13RGB(nearC0, first,  second);
   stb__Lerp13RGB(nearC1, second, first);
}
194
195 // Block dithering function. Simply dithers a block to 565 RGB.
196 // (Floyd-Steinberg)
stb__DitherBlock(unsigned char * dest,unsigned char * block)197 static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
198 {
199 int err[8],*ep1 = err,*ep2 = err+4, *et;
200 int ch,y;
201
202 // process channels seperately
203 for (ch=0; ch<3; ++ch) {
204 unsigned char *bp = block+ch, *dp = dest+ch;
205 unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
206 memset(err, 0, sizeof(err));
207 for(y=0; y<4; ++y) {
208 dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
209 ep1[0] = bp[ 0] - dp[ 0];
210 dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
211 ep1[1] = bp[ 4] - dp[ 4];
212 dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
213 ep1[2] = bp[ 8] - dp[ 8];
214 dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
215 ep1[3] = bp[12] - dp[12];
216 bp += 16;
217 dp += 16;
218 et = ep1, ep1 = ep2, ep2 = et; // swap
219 }
220 }
221 }
222
// The color matching function.
// Assigns every pixel of a 4x4 RGBA block to one of the four palette entries in
// `color` (4 bytes per entry, as produced by stb__EvalColors) and returns the
// 32-bit index mask: 2 bits per pixel, pixel 0 in the lowest bits.
// A non-zero `dither` enables Floyd-Steinberg error diffusion of the projected values.
static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither)
{
   unsigned int mask = 0;
   int dirr = color[0*4+0] - color[1*4+0];
   int dirg = color[0*4+1] - color[1*4+1];
   int dirb = color[0*4+2] - color[1*4+2];
   int dots[16];
   int stops[4];
   int i;
   int c0Point, halfPoint, c3Point;

   for(i=0;i<16;i++)
      dots[i] = block[i*4+0]*dirr + block[i*4+1]*dirg + block[i*4+2]*dirb;

   for(i=0;i<4;i++)
      stops[i] = color[i*4+0]*dirr + color[i*4+1]*dirg + color[i*4+2]*dirb;

   // think of the colors as arranged on a line; project point onto that line, then choose
   // next color out of available ones. we compute the crossover points for "best color in top
   // half"/"best in bottom half" and then the same inside that subinterval.
   //
   // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
   // but it's very close and a lot faster.
   // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html

   c0Point   = (stops[1] + stops[3]) >> 1;
   halfPoint = (stops[3] + stops[2]) >> 1;
   c3Point   = (stops[2] + stops[0]) >> 1;

   if(!dither)
   {
      // the version without dithering is straightforward

#ifdef NEW_OPTIMISATIONS
      // Palette index for each combination of the three threshold tests, already
      // positioned in the top two bits of the mask. The entries are unsigned:
      // shifting 2 or 3 left by 30 as a signed int overflows.
      static const unsigned int indexMap[8] = {
         0u << 30, 2u << 30, 0u << 30, 2u << 30,
         3u << 30, 3u << 30, 1u << 30, 1u << 30
      };

      for(i=0;i<16;i++)
      {
         int dot = dots[i];
         int bits;

         // earlier pixels migrate towards the low bits as later ones are inserted on top
         mask >>= 2;

         bits = ( (dot < halfPoint) ? 4 : 0 )
              | ( (dot < c0Point)   ? 2 : 0 )
              | ( (dot < c3Point)   ? 1 : 0 );

         mask |= indexMap[bits];
      }

#else
      for (i=15;i>=0;i--) {
         int dot = dots[i];
         mask <<= 2;

         if(dot < halfPoint)
            mask |= (dot < c0Point) ? 1 : 3;
         else
            mask |= (dot < c3Point) ? 2 : 0;
      }
#endif

   } else {
      // with floyd-steinberg dithering
      int err[8],*ep1 = err,*ep2 = err+4;
      int *dp = dots, y;

      // Thresholds and dot products are scaled by 16 to match the error weights,
      // which sum to 16. Multiplication is used instead of "<< 4" because the
      // values may be negative and left-shifting a negative int is undefined.
      c0Point   *= 16;
      halfPoint *= 16;
      c3Point   *= 16;
      for(i=0;i<8;i++)
         err[i] = 0;

      for(y=0;y<4;y++)
      {
         int dot,step;
         unsigned int lmask;   // unsigned: the last row is shifted left by 24 bits

         dot = (dp[0] * 16) + (3*ep2[1] + 5*ep2[0]);
         if(dot < halfPoint)
            step = (dot < c0Point) ? 1 : 3;
         else
            step = (dot < c3Point) ? 2 : 0;
         ep1[0] = dp[0] - stops[step];
         lmask = (unsigned int) step;

         dot = (dp[1] * 16) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]);
         if(dot < halfPoint)
            step = (dot < c0Point) ? 1 : 3;
         else
            step = (dot < c3Point) ? 2 : 0;
         ep1[1] = dp[1] - stops[step];
         lmask |= (unsigned int) step << 2;

         dot = (dp[2] * 16) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]);
         if(dot < halfPoint)
            step = (dot < c0Point) ? 1 : 3;
         else
            step = (dot < c3Point) ? 2 : 0;
         ep1[2] = dp[2] - stops[step];
         lmask |= (unsigned int) step << 4;

         dot = (dp[3] * 16) + (7*ep1[2] + 5*ep2[3] + ep2[2]);
         if(dot < halfPoint)
            step = (dot < c0Point) ? 1 : 3;
         else
            step = (dot < c3Point) ? 2 : 0;
         ep1[3] = dp[3] - stops[step];
         lmask |= (unsigned int) step << 6;

         dp += 4;
         mask |= lmask << (y*8);
         { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
      }
   }

   return mask;
}
339
// The color optimization function. (Clever code, part 1)
// Estimates the principal axis of the block's RGB distribution (covariance
// matrix + a few power iterations), picks the two pixels that are extremal
// along that axis and returns them as 5:6:5 colors in *pmax16 / *pmin16.
static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
{
   unsigned char *minp, *maxp;
   double magn;
   int v_r,v_g,v_b;
   static const int nIterPower = 4;
   float covf[6],vfr,vfg,vfb;

   // determine color distribution
   int cov[6];
   int mu[3],min[3],max[3];
   int ch,i,iter;
   int mind, maxd;

   for(ch=0;ch<3;ch++)
   {
      const unsigned char *bp = ((const unsigned char *) block) + ch;
      int muv,minv,maxv;

#ifdef NEW_OPTIMISATIONS
      // Branch-free min/max reduction over the 16 samples of this channel.
      // The helper macros are local to this function: they carry a prefix and are
      // #undef'd right after use so they cannot clash with MIN/MAX definitions
      // elsewhere. They rely on ">> 31" of a negative int producing an all-ones
      // mask (arithmetic shift) and evaluate their arguments more than once.
#     define STB__MIN2(a,b)      ((int)(a) + ( ((int)(b)-(int)(a)) & ( ((int)(b)-(int)(a)) >> 31 ) ))
#     define STB__MAX2(a,b)      ((int)(a) + ( ((int)(b)-(int)(a)) & ( ((int)(a)-(int)(b)) >> 31 ) ))
#     define STB__RANGE(a,b,n)   int min##n = STB__MIN2(a,b); int max##n = (a)+(b) - min##n; muv += (a)+(b);
#     define STB__MINMAX(a,b,n)  int min##n = STB__MIN2(min##a, min##b); int max##n = STB__MAX2(max##a, max##b);

      muv = 0;
      STB__RANGE(bp[0],  bp[4],  1);
      STB__RANGE(bp[8],  bp[12], 2);
      STB__RANGE(bp[16], bp[20], 3);
      STB__RANGE(bp[24], bp[28], 4);
      STB__RANGE(bp[32], bp[36], 5);
      STB__RANGE(bp[40], bp[44], 6);
      STB__RANGE(bp[48], bp[52], 7);
      STB__RANGE(bp[56], bp[60], 8);

      STB__MINMAX(1,2,9);
      STB__MINMAX(3,4,10);
      STB__MINMAX(5,6,11);
      STB__MINMAX(7,8,12);

      STB__MINMAX(9,10,13);
      STB__MINMAX(11,12,14);

      minv = STB__MIN2(min13,min14);
      maxv = STB__MAX2(max13,max14);

#     undef STB__MINMAX
#     undef STB__RANGE
#     undef STB__MAX2
#     undef STB__MIN2

#else
      muv = minv = maxv = bp[0];
      for(i=4;i<64;i+=4)
      {
         muv += bp[i];
         if (bp[i] < minv) minv = bp[i];
         else if (bp[i] > maxv) maxv = bp[i];
      }
#endif

      mu[ch] = (muv + 8) >> 4;   // rounded mean of the 16 samples
      min[ch] = minv;
      max[ch] = maxv;
   }

   // determine covariance matrix (upper triangle: rr, rg, rb, gg, gb, bb)
   for (i=0;i<6;i++)
      cov[i] = 0;

   for (i=0;i<16;i++)
   {
      int r = block[i*4+0] - mu[0];
      int g = block[i*4+1] - mu[1];
      int b = block[i*4+2] - mu[2];

      cov[0] += r*r;
      cov[1] += r*g;
      cov[2] += r*b;
      cov[3] += g*g;
      cov[4] += g*b;
      cov[5] += b*b;
   }

   // convert covariance matrix to float, find principal axis via power iter
   for(i=0;i<6;i++)
      covf[i] = cov[i] / 255.0f;

   // start vector: per-channel extent of the block
   vfr = (float) (max[0] - min[0]);
   vfg = (float) (max[1] - min[1]);
   vfb = (float) (max[2] - min[2]);

   for(iter=0;iter<nIterPower;iter++)
   {
      float r = vfr*covf[0] + vfg*covf[1] + vfb*covf[2];
      float g = vfr*covf[1] + vfg*covf[3] + vfb*covf[4];
      float b = vfr*covf[2] + vfg*covf[4] + vfb*covf[5];

      vfr = r;
      vfg = g;
      vfb = b;
   }

   magn = fabs(vfr);
   if (fabs(vfg) > magn) magn = fabs(vfg);
   if (fabs(vfb) > magn) magn = fabs(vfb);

   if(magn < 4.0f)
   { // too small, default to luminance
      v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
      v_g = 587;
      v_b = 114;
   } else {
      // normalize so that the largest component has magnitude 512
      magn = 512.0 / magn;
      v_r = (int) (vfr * magn);
      v_g = (int) (vfg * magn);
      v_b = (int) (vfb * magn);
   }

#ifdef NEW_OPTIMISATIONS
   // Pick colors at extreme points
   mind = maxd = block[0]*v_r + block[1]*v_g + block[2]*v_b;
   minp = maxp = block;
   for(i=1;i<16;i++)
   {
      int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;

      if (dot < mind) {
         mind = dot;
         minp = block+i*4;
         continue;   // a new minimum cannot also be a new maximum
      }

      if (dot > maxd) {
         maxd = dot;
         maxp = block+i*4;
      }
   }
#else
   mind = 0x7fffffff;
   maxd = -0x7fffffff;
   // Pick colors at extreme points
   for(i=0;i<16;i++)
   {
      int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;

      if (dot < mind) {
         mind = dot;
         minp = block+i*4;
      }

      if (dot > maxd) {
         maxd = dot;
         maxp = block+i*4;
      }
   }
#endif

   *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]);
   *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
}
497
// Truncates y to an int and clamps the result to the range [p0, p1].
inline static int stb__sclamp(float y, int p0, int p1)
{
   int truncated = (int) y;

#ifdef NEW_OPTIMISATIONS
   // upper bound first, then lower bound
   if (truncated > p1)
      truncated = p1;
   return (truncated < p0) ? p0 : truncated;
#else
   // lower bound first, then upper bound
   return (truncated < p0) ? p0
        : (truncated > p1) ? p1
        : truncated;
#endif
}
511
512 // The refinement function. (Clever code, part 2)
513 // Tries to optimize colors to suit block contents better.
514 // (By solving a least squares system via normal equations+Cramer's rule)
stb__RefineBlock(unsigned char * block,unsigned short * pmax16,unsigned short * pmin16,unsigned int mask)515 static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
516 {
517 static const int w1Tab[4] = { 3,0,2,1 };
518 static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
519 // ^some magic to save a lot of multiplies in the accumulating loop...
520 // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
521
522 float frb,fg;
523 unsigned short oldMin, oldMax, min16, max16;
524 int i, akku = 0, xx,xy,yy;
525 int At1_r,At1_g,At1_b;
526 int At2_r,At2_g,At2_b;
527 unsigned int cm = mask;
528
529 oldMin = *pmin16;
530 oldMax = *pmax16;
531
532 if((mask ^ (mask<<2)) < 4) // all pixels have the same index?
533 {
534 // yes, linear system would be singular; solve using optimal
535 // single-color match on average color
536 int r = 8, g = 8, b = 8;
537 for (i=0;i<16;++i) {
538 r += block[i*4+0];
539 g += block[i*4+1];
540 b += block[i*4+2];
541 }
542
543 r >>= 4; g >>= 4; b >>= 4;
544
545 max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
546 min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
547 } else {
548 At1_r = At1_g = At1_b = 0;
549 At2_r = At2_g = At2_b = 0;
550 for (i=0;i<16;++i,cm>>=2)
551 {
552 int step = cm&3;
553 int w1 = w1Tab[step];
554 int r = block[i*4+0];
555 int g = block[i*4+1];
556 int b = block[i*4+2];
557
558 akku += prods[step];
559 At1_r += w1*r;
560 At1_g += w1*g;
561 At1_b += w1*b;
562 At2_r += r;
563 At2_g += g;
564 At2_b += b;
565 }
566
567 At2_r = 3*At2_r - At1_r;
568 At2_g = 3*At2_g - At1_g;
569 At2_b = 3*At2_b - At1_b;
570
571 // extract solutions and decide solvability
572 xx = akku >> 16;
573 yy = (akku >> 8) & 0xff;
574 xy = (akku >> 0) & 0xff;
575
576 frb = 3.0f * 31.0f / 255.0f / (xx*yy - xy*xy);
577 fg = frb * 63.0f / 31.0f;
578
579 // solve.
580 max16 = stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11;
581 max16 |= stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5;
582 max16 |= stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0;
583
584 min16 = stb__sclamp((At2_r*xx - At1_r*xy)*frb+0.5f,0,31) << 11;
585 min16 |= stb__sclamp((At2_g*xx - At1_g*xy)*fg +0.5f,0,63) << 5;
586 min16 |= stb__sclamp((At2_b*xx - At1_b*xy)*frb+0.5f,0,31) << 0;
587 }
588
589 *pmin16 = min16;
590 *pmax16 = max16;
591 return oldMin != min16 || oldMax != max16;
592 }
593
594 // Color block compression
stb__CompressColorBlock(unsigned char * dest,unsigned char * block,int mode)595 static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
596 {
597 unsigned int mask;
598 int i;
599 int dither;
600 int refinecount;
601 unsigned short max16, min16;
602 unsigned char dblock[16*4],color[4*4];
603
604 dither = mode & STB_DXT_DITHER;
605 refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
606
607 // check if block is constant
608 for (i=1;i<16;i++)
609 if (((unsigned int *) block)[i] != ((unsigned int *) block)[0])
610 break;
611
612 if(i == 16)
613 { // constant color
614 int r = block[0], g = block[1], b = block[2];
615 mask = 0xaaaaaaaa;
616 max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
617 min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
618 } else
619 {
620 // first step: compute dithered version for PCA if desired
621 if(dither)
622 stb__DitherBlock(dblock,block);
623
624 // second step: pca+map along principal axis
625 stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16);
626 if (max16 != min16)
627 {
628 stb__EvalColors(color,max16,min16);
629 mask = stb__MatchColorsBlock(block,color,dither);
630 } else
631 mask = 0;
632
633 // third step: refine (multiple times if requested)
634 for (i=0;i<refinecount;i++) {
635 unsigned int lastmask = mask;
636
637 if (stb__RefineBlock(dither ? dblock : block,&max16,&min16,mask))
638 {
639 if (max16 != min16)
640 {
641 stb__EvalColors(color,max16,min16);
642 mask = stb__MatchColorsBlock(block,color,dither);
643 } else
644 {
645 mask = 0;
646 break;
647 }
648 }
649
650 if(mask == lastmask)
651 break;
652 }
653 }
654
655 // write the color block
656 if(max16 < min16)
657 {
658 unsigned short t = min16;
659 min16 = max16;
660 max16 = t;
661 mask ^= 0x55555555;
662 }
663
664 dest[0] = (unsigned char) (max16);
665 dest[1] = (unsigned char) (max16 >> 8);
666 dest[2] = (unsigned char) (min16);
667 dest[3] = (unsigned char) (min16 >> 8);
668 dest[4] = (unsigned char) (mask);
669 dest[5] = (unsigned char) (mask >> 8);
670 dest[6] = (unsigned char) (mask >> 16);
671 dest[7] = (unsigned char) (mask >> 24);
672 }
673
// Alpha block compression (this is easy for a change).
// Encodes the alpha channel (byte 3 of every pixel) of the 4x4 RGBA block `src`
// into the 8 bytes at `dest`: maximum alpha, minimum alpha, then sixteen 3-bit
// indices packed least significant bit first.
// `mode` is accepted for symmetry with the color path and is not used.
static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src,int mode)
{
   int i,dist,bias,dist4,dist2,bits,mask;

   // find min/max alpha
   int mn,mx;

   (void) mode;

   mn = mx = src[3];
   for (i=1;i<16;i++)
   {
      if (src[i*4+3] < mn) mn = src[i*4+3];
      else if (src[i*4+3] > mx) mx = src[i*4+3];
   }

   // encode them
   dest[0] = (unsigned char) mx;
   dest[1] = (unsigned char) mn;
   dest += 2;

#ifdef NEW_OPTIMISATIONS
   // mono-alpha shortcut: all six index bytes are zero (every pixel uses index 0).
   // memset avoids storing through wider pointer types at an address that need
   // not be suitably aligned.
   if (mn==mx)
   {
      memset(dest, 0, 6);
      return;
   }
#endif

   // determine bias and emit color indices
   // given the choice of mx/mn, these indices are optimal:
   // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
   dist = mx-mn;
   dist4 = dist*4;
   dist2 = dist*2;
   bias = (dist < 8) ? (dist - 1) : (dist/2 + 2);
   bias -= mn * 7;
   bits = 0, mask=0;

   for (i=0;i<16;i++)
   {
      int a = src[i*4+3]*7 + bias;
      int ind,t;

      // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
      t = (a >= dist4) ? -1 : 0; ind =  t & 4; a -= dist4 & t;
      t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t;
      ind += (a >= dist);

      // turn linear scale into DXT index (0/1 are extremal pts)
      ind = -ind & 7;
      ind ^= (2 > ind);

      // write index; a byte is flushed as soon as 8 bits have accumulated
      mask |= ind << bits;
      if((bits += 3) >= 8)
      {
         *dest++ = (unsigned char) mask;
         mask >>= 8;
         bits -= 8;
      }
   }
}
740
741
stb__InitDXT()742 static void stb__InitDXT()
743 {
744 int i;
745 for(i=0;i<32;i++)
746 stb__Expand5[i] = (i<<3)|(i>>2);
747
748 for(i=0;i<64;i++)
749 stb__Expand6[i] = (i<<2)|(i>>4);
750
751 for(i=0;i<256+16;i++)
752 {
753 int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8;
754 stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)];
755 stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)];
756 }
757
758 stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32);
759 stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64);
760 }
761
762
// Compresses one 4x4 block of RGBA pixels (row-major, 64 bytes) read from src.
// With alpha != 0 an 8-byte alpha block is written first, followed by the 8-byte
// color block (16 bytes total); otherwise only the color block is written.
// mode is a combination of the STB_DXT_* flags.
// The lookup tables are built on the first call, guarded by a plain static flag.
void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
{
   static int needsInit = 1;
   unsigned char *pixels = (unsigned char *) src;   // the helpers take non-const pointers
   unsigned char *out = dest;

   if (needsInit)
   {
      stb__InitDXT();
      needsInit = 0;
   }

   if (alpha)
   {
      stb__CompressAlphaBlock(out, pixels, mode);
      out += 8;
   }

   stb__CompressColorBlock(out, pixels, mode);
}
780
// Returns the smaller of x and y.
int imin(int x, int y)
{
   if (x < y)
      return x;
   return y;
}
782
783
784
785
786
// Copies the 4x4 pixel block whose top-left corner is (x, y) out of the w x h
// RGBA image `src` (4 bytes per pixel, rows of w*4 bytes) into `block` (64 bytes,
// row-major). When the block sticks out over the right or bottom edge, the
// available pixels are repeated according to the `rem` table so that all 16
// pixels are filled. Assumes 0 <= x < w and 0 <= y < h.
static void extractBlock(const unsigned char *src, int x, int y,
                         int w, int h, unsigned char *block)
{
   // rem[(n-1)*4 + k]: source offset to use for position k when only n pixels are available
   static const int rem[16] =
   {
      0, 0, 0, 0,
      0, 1, 0, 1,
      0, 1, 2, 0,
      0, 1, 2, 3
   };
   const size_t stride = (size_t) w * 4;   // bytes per image row; size_t avoids int overflow
   int i, j;
   int bw, bh;

#ifdef NEW_OPTIMISATIONS
   if ((w-x >=4) && (h-y >=4))
   {
      // Full square shortcut: four 16-byte row copies. memcpy is used instead of
      // word-sized loads/stores because neither src nor block is guaranteed to be
      // aligned for unsigned int access.
      const unsigned char *first = src + (size_t) y * stride + (size_t) x * 4;
      for (i=0; i < 4; ++i)
         memcpy(block + i*16, first + (size_t) i * stride, 16);
      return;
   }
#endif

   // number of pixels actually available in each direction (at most 4)
   bw = (w - x < 4) ? (w - x) : 4;
   bh = (h - y < 4) ? (h - y) : 4;

   for(i = 0; i < 4; ++i)
   {
      const int by = rem[(bh - 1) * 4 + i] + y;
      for(j = 0; j < 4; ++j)
      {
         const int bx = rem[(bw - 1) * 4 + j] + x;
         memcpy(block + (i * 4 * 4) + (j * 4),
                src + (size_t) by * stride + (size_t) bx * 4,
                4);
      }
   }
}
839
// Clamps n to the range 0..255 and returns it as a byte.
inline static unsigned char clamp255( int n )
{
   return (unsigned char)( (n > 255) ? 255 : ( (n < 0) ? 0 : n ) );
}
847
848
// Converts a 4x4 block of RGBA pixels (64 bytes at src) into the layout used for
// YCoCg-in-DXT5 encoding (64 bytes at dst). Per pixel the output bytes are:
//   [0] Co   [1] Cg   [2] scale byte   [3] Y
// The alpha byte of the input is ignored.
//
// NOTE: a scaled variant (Co/Cg divided by a per-block scale factor derived from
// their extents, with the quantized factor stored in byte 2) was never active:
// its extents loop had a zero iteration count, so the factor was always 1 and
// byte 2 always 0. Because Co/Cg are written unscaled, byte 2 is kept at 0 here.
void rgbToYCoCgBlock( unsigned char * dst, const unsigned char * src )
{
   const unsigned char scaleByte = 0;   // quantized scale factor; 0 encodes a factor of 1
   const unsigned char *px = src;
   unsigned char *outPx = dst;
   int i;

   for(i=0;i<16;i++)
   {
      // Read all inputs before writing anything for this pixel
      const int r = px[0];
      const int g = (px[1] + 1) >> 1;          // half of green, rounded
      const int b = px[2];
      const int tmp = (2 + r + b) >> 2;        // quarter of (red + blue), rounded

      const unsigned char co = clamp255( 128 + ((r - b + 1) >> 1) );
      const unsigned char y  = clamp255( g + tmp );
      const unsigned char cg = clamp255( 128 + g - tmp );

      outPx[0] = co;
      outPx[1] = cg;
      outPx[2] = scaleByte;
      outPx[3] = y;

      px += 4;
      outPx += 4;
   }
}
940
941
rygCompress(unsigned char * dst,unsigned char * src,int w,int h,int isDxt5,int & compressed_size)942 void rygCompress(unsigned char *dst, unsigned char *src, int w, int h, int isDxt5, int& compressed_size)
943 {
944
945 unsigned char block[64];
946 int x, y;
947
948 unsigned char* initial_dst = dst;
949
950 for (y = 0; y < h; y += 4)
951 {
952 for(x = 0; x < w; x += 4)
953 {
954 extractBlock(src, x, y, w, h, block);
955 stb_compress_dxt_block(dst, block, isDxt5, STB_DXT_NORMAL);
956 dst += isDxt5 ? 16 : 8;
957 }
958 }
959
960 compressed_size = dst - initial_dst;
961 }
962
// Compresses the w x h RGBA image at src into dst as 16-byte blocks: every 4x4
// block is converted with rgbToYCoCgBlock and then encoded with alpha enabled.
// The mode value 10 is passed through unchanged; its STB_DXT_HIGHQUAL bit (2) is
// set and its STB_DXT_DITHER bit (1) is clear.
void rygCompressYCoCg( unsigned char *dst, unsigned char *src, int w, int h )
{
   unsigned char rgbaBlock[64];
   unsigned char ycocgBlock[64];
   unsigned char *out = dst;
   int bx, by;

   for (by = 0; by < h; by += 4)
   {
      for (bx = 0; bx < w; bx += 4)
      {
         extractBlock(src, bx, by, w, h, rgbaBlock);
         rgbToYCoCgBlock(ycocgBlock, rgbaBlock);
         stb_compress_dxt_block(out, ycocgBlock, 1, 10);
         out += 16;
      }
   }
}
981
// Compresses the w x h RGBA image at rgba into p, block by block in row-major
// block order (16 bytes per block when isDxt5 is non-zero, 8 otherwise).
// Blocks that overlap the right or bottom edge are padded by repeating the
// pixels that are available.
static void stbgl__compress(unsigned char *p, unsigned char *rgba, int w, int h, int isDxt5)
{
   int i,j,y,y2;
   int alpha = isDxt5;

   // j is the first pixel row of the block and i its first pixel column, so j is
   // bounded by the height and i by the width. (The bounds used to be swapped,
   // which only worked for square images.)
   for (j=0; j < h; j += 4) {
      int x=4;   // number of pixel columns available in the current block
      for (i=0; i < w; i += 4) {
         unsigned char block[16*4];
         if (i+3 >= w) x = w-i;

         // copy the rows that exist; y ends up as the number of rows copied
         for (y=0; y < 4; ++y) {
            if (j+y >= h) break;
            memcpy(block+y*16, rgba + w*4*(j+y) + i*4, x*4);
         }

         // pad missing columns
         if (x < 4) {
            switch (x) {
               case 0:
                  assert(0);   // unreachable: i < w guarantees x >= 1
                  break;
               case 1:
                  for (y2=0; y2 < y; ++y2) {
                     memcpy(block+y2*16+1*4, block+y2*16+0*4, 4);
                     memcpy(block+y2*16+2*4, block+y2*16+0*4, 8);
                  }
                  break;
               case 2:
                  for (y2=0; y2 < y; ++y2)
                     memcpy(block+y2*16+2*4, block+y2*16+0*4, 8);
                  break;
               case 3:
                  for (y2=0; y2 < y; ++y2)
                     memcpy(block+y2*16+3*4, block+y2*16+1*4, 4);
                  break;
            }
         }

         // pad missing rows by repeating the rows from the top of the block
         y2 = 0;
         for(; y<4; ++y,++y2)
            memcpy(block+y*16, block+y2*16, 4*4);

         stb_compress_dxt_block(p, block, alpha, 10);
         p += alpha ? 16 : 8;
      }
   }
}
1024
linearize(unsigned char inByte)1025 static inline unsigned char linearize(unsigned char inByte)
1026 {
1027 float srgbVal = ((float)inByte) / 255.0f;
1028 float linearVal;
1029
1030 if(srgbVal < 0.04045)
1031 linearVal = srgbVal / 12.92f;
1032 else
1033 linearVal = pow( (srgbVal + 0.055f) / 1.055f, 2.4f);
1034
1035 return (unsigned char)(floor(sqrt(linearVal)* 255.0 + 0.5));
1036 }
1037
linearize(unsigned char * dst,const unsigned char * src,int n)1038 void linearize( unsigned char * dst, const unsigned char * src, int n )
1039 {
1040 n*=4;
1041 for( int i = 0; i < n; i++ )
1042 dst[i] = linearize(src[i]);
1043 }
1044
1045
1046
1047 #endif // STB_DXT_IMPLEMENTATION
1048
1049 #endif // STB_INCLUDE_STB_DXT_H
1050