1 /**************************************************************************
2  *
3  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
4  * Copyright (c) 2008 VMware, Inc.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  **************************************************************************/
25 
26 #include "util/format/u_format.h"
27 #include "util/format/u_format_fxt1.h"
28 #include "util/format/u_format_pack.h"
29 #include "util/format_srgb.h"
30 #include "util/u_math.h"
31 
/* Positions of the colour components inside one texel / base vector. */
#define RCOMP 0
#define GCOMP 1
#define BCOMP 2
#define ACOMP 3

/* NOTE(review): presumably the size in bytes of one compressed block (the
 * quantizers below fill four 32-bit words per block) -- confirm at the use
 * sites, which are outside this section of the file.
 */
#define FXT1_BLOCK_SIZE 16
38 
/* Forward declaration; the definition appears elsewhere in this file. */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride);

/* Forward declaration; the definition appears elsewhere in this file. */
static void
fxt1_decode_1 (const void *texture, int32_t stride,
               int32_t i, int32_t j, uint8_t *rgba);
47 
48 /***************************************************************************\
49  * FXT1 encoder
50  *
51  * The encoder was built by reversing the decoder,
52  * and is vaguely based on Texus2 by 3dfx. Note that this code
53  * is merely a proof of concept, since it is highly UNoptimized;
54  * moreover, it is sub-optimal due to initial conditions passed
55  * to Lloyd's algorithm (the interpolation modes are even worse).
56 \***************************************************************************/
57 
58 
#define MAX_COMP 4 /* largest number of components a texel can have */
#define MAX_VECT 4 /* largest number of base vectors any mode looks for */
#define N_TEXELS 32 /* number of texels in a block (always 32) */
#define LL_N_REP 50 /* iteration cap of the Lloyd quantizer */
#define LL_RMS_D 10 /* Lloyd stops once the error improves by less than this */
#define LL_RMS_E 255 /* Lloyd stops once the error drops below this */
#define ALPHA_TS 2 /* alpha threshold: (255 - ALPHA_TS) deemed opaque */
/* All-zero 32-bit pattern that ISTBLACK() compares against. */
static const uint32_t zero = 0;
/* True when the first sizeof(zero) bytes of v are zero.  The quantizers apply
 * it to a single MAX_COMP-byte texel, i.e. R = G = B = A = 0, which they call
 * "transparent black".
 */
#define ISTBLACK(v) (memcmp(&(v), &zero, sizeof(zero)) == 0)
68 
/*
 * Define a 64-bit unsigned integer type and macros
 *
 * FX64_MOV32(a, b)  load the 32-bit value b into a
 * FX64_OR32(a, b)   OR the 32-bit value b into the low bits of a
 * FX64_SHL(a, c)    shift a left by c bits
 *
 * The native uint64_t variant is the one compiled (the condition below is
 * hard-wired to 1).  The struct fallback is kept for reference only; note
 * that its FX64_MOV32 leaves the .hi half untouched and its FX64_SHL
 * evaluates "a.lo >> (32 - c)", so it is only meaningful for 0 < c < 64.
 */
#if 1

#define FX64_NATIVE 1

typedef uint64_t Fx64;

#define FX64_MOV32(a, b) a = b
#define FX64_OR32(a, b)  a |= b
#define FX64_SHL(a, c)   a <<= c

#else

#define FX64_NATIVE 0

typedef struct {
   uint32_t lo, hi;
} Fx64;

#define FX64_MOV32(a, b) a.lo = b
#define FX64_OR32(a, b)  a.lo |= b

#define FX64_SHL(a, c)                                 \
   do {                                                \
       if ((c) >= 32) {                                \
          a.hi = a.lo << ((c) - 32);                   \
          a.lo = 0;                                    \
       } else {                                        \
          a.hi = (a.hi << (c)) | (a.lo >> (32 - (c))); \
          a.lo <<= (c);                                \
       }                                               \
   } while (0)

#endif
105 
106 
/* Per-component weight used by MAKEIVEC; currently always 1. */
#define F(i) (float)1 /* can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11 */
/* Non-zero makes CALCCDOT clamp its result to [0, NV]. */
#define SAFECDOT 1 /* for paranoids */

/*
 * MAKEIVEC(NV, NC, IV, B, V0, V1)
 *
 * Builds the projection used by CALCCDOT: after the call,
 * (int)(dot(V, IV) + B) is the rounded position of V on the segment
 * V0 -> V1, scaled so that V0 maps to 0 and V1 maps to NV.
 *
 * NC is the number of components, IV receives the interpolation vector and
 * B the offset (including the +0.5 used for rounding).  The macro uses a
 * loop variable named `i` that must already exist in the calling scope, and
 * it divides by the squared length of (V1 - V0), so the two endpoints must
 * differ in at least one component.
 */
#define MAKEIVEC(NV, NC, IV, B, V0, V1)  \
   do {                                  \
      /* compute interpolation vector */ \
      float d2 = 0.0F;                   \
      float rd2;                         \
                                         \
      for (i = 0; i < NC; i++) {         \
         IV[i] = (V1[i] - V0[i]) * F(i); \
         d2 += IV[i] * IV[i];            \
      }                                  \
      rd2 = (float)NV / d2;              \
      B = 0;                             \
      for (i = 0; i < NC; i++) {         \
         IV[i] *= F(i);                  \
         B -= IV[i] * V0[i];             \
         IV[i] *= rd2;                   \
      }                                  \
      B = B * rd2 + 0.5f;                \
   } while (0)

/*
 * CALCCDOT(TEXEL, NV, NC, IV, B, V)
 *
 * Stores in TEXEL the interpolation index of colour V, using the IV / B pair
 * prepared by MAKEIVEC.  The value is truncated to int32_t and, when SAFECDOT
 * is non-zero, clamped to the range [0, NV].  Like MAKEIVEC it relies on a
 * loop variable `i` declared by the caller.
 */
#define CALCCDOT(TEXEL, NV, NC, IV, B, V)\
   do {                                  \
      float dot = 0.0F;                  \
      for (i = 0; i < NC; i++) {         \
         dot += V[i] * IV[i];            \
      }                                  \
      TEXEL = (int32_t)(dot + B);        \
      if (SAFECDOT) {                    \
         if (TEXEL < 0) {                \
            TEXEL = 0;                   \
         } else if (TEXEL > NV) {        \
            TEXEL = NV;                  \
         }                               \
      }                                  \
   } while (0)
145 
146 
147 static int32_t
fxt1_bestcol(float vec[][MAX_COMP],int32_t nv,uint8_t input[MAX_COMP],int32_t nc)148 fxt1_bestcol (float vec[][MAX_COMP], int32_t nv,
149               uint8_t input[MAX_COMP], int32_t nc)
150 {
151    int32_t i, j, best = -1;
152    float err = 1e9; /* big enough */
153 
154    for (j = 0; j < nv; j++) {
155       float e = 0.0F;
156       for (i = 0; i < nc; i++) {
157          e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
158       }
159       if (e < err) {
160          err = e;
161          best = j;
162       }
163    }
164 
165    return best;
166 }
167 
168 
169 static int32_t
fxt1_worst(float vec[MAX_COMP],uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)170 fxt1_worst (float vec[MAX_COMP],
171             uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
172 {
173    int32_t i, k, worst = -1;
174    float err = -1.0F; /* small enough */
175 
176    for (k = 0; k < n; k++) {
177       float e = 0.0F;
178       for (i = 0; i < nc; i++) {
179          e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
180       }
181       if (e > err) {
182          err = e;
183          worst = k;
184       }
185    }
186 
187    return worst;
188 }
189 
190 
191 static int32_t
fxt1_variance(uint8_t input[N_TEXELS/2][MAX_COMP],int32_t nc)192 fxt1_variance (uint8_t input[N_TEXELS / 2][MAX_COMP], int32_t nc)
193 {
194    const int n = N_TEXELS / 2;
195    int32_t i, k, best = 0;
196    int32_t sx, sx2;
197    double var, maxvar = -1; /* small enough */
198    double teenth = 1.0 / n;
199 
200    for (i = 0; i < nc; i++) {
201       sx = sx2 = 0;
202       for (k = 0; k < n; k++) {
203          int32_t t = input[k][i];
204          sx += t;
205          sx2 += t * t;
206       }
207       var = sx2 * teenth - sx * sx * teenth * teenth;
208       if (maxvar < var) {
209          maxvar = var;
210          best = i;
211       }
212    }
213 
214    return best;
215 }
216 
217 
218 static int32_t
fxt1_choose(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)219 fxt1_choose (float vec[][MAX_COMP], int32_t nv,
220              uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
221 {
222 #if 0
223    /* Choose colors from a grid.
224     */
225    int32_t i, j;
226 
227    for (j = 0; j < nv; j++) {
228       int32_t m = j * (n - 1) / (nv - 1);
229       for (i = 0; i < nc; i++) {
230          vec[j][i] = input[m][i];
231       }
232    }
233 #else
234    /* Our solution here is to find the darkest and brightest colors in
235     * the 8x4 tile and use those as the two representative colors.
236     * There are probably better algorithms to use (histogram-based).
237     */
238    int32_t i, j, k;
239    int32_t minSum = 2000; /* big enough */
240    int32_t maxSum = -1; /* small enough */
241    int32_t minCol = 0; /* phoudoin: silent compiler! */
242    int32_t maxCol = 0; /* phoudoin: silent compiler! */
243 
244    struct {
245       int32_t flag;
246       int32_t key;
247       int32_t freq;
248       int32_t idx;
249    } hist[N_TEXELS];
250    int32_t lenh = 0;
251 
252    memset(hist, 0, sizeof(hist));
253 
254    for (k = 0; k < n; k++) {
255       int32_t l;
256       int32_t key = 0;
257       int32_t sum = 0;
258       for (i = 0; i < nc; i++) {
259          key <<= 8;
260          key |= input[k][i];
261          sum += input[k][i];
262       }
263       for (l = 0; l < n; l++) {
264          if (!hist[l].flag) {
265             /* alloc new slot */
266             hist[l].flag = !0;
267             hist[l].key = key;
268             hist[l].freq = 1;
269             hist[l].idx = k;
270             lenh = l + 1;
271             break;
272          } else if (hist[l].key == key) {
273             hist[l].freq++;
274             break;
275          }
276       }
277       if (minSum > sum) {
278          minSum = sum;
279          minCol = k;
280       }
281       if (maxSum < sum) {
282          maxSum = sum;
283          maxCol = k;
284       }
285    }
286 
287    if (lenh <= nv) {
288       for (j = 0; j < lenh; j++) {
289          for (i = 0; i < nc; i++) {
290             vec[j][i] = (float)input[hist[j].idx][i];
291          }
292       }
293       for (; j < nv; j++) {
294          for (i = 0; i < nc; i++) {
295             vec[j][i] = vec[0][i];
296          }
297       }
298       return 0;
299    }
300 
301    for (j = 0; j < nv; j++) {
302       for (i = 0; i < nc; i++) {
303          vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
304       }
305    }
306 #endif
307 
308    return !0;
309 }
310 
311 
312 static int32_t
fxt1_lloyd(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)313 fxt1_lloyd (float vec[][MAX_COMP], int32_t nv,
314             uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
315 {
316    /* Use the generalized lloyd's algorithm for VQ:
317     *     find 4 color vectors.
318     *
319     *     for each sample color
320     *         sort to nearest vector.
321     *
322     *     replace each vector with the centroid of its matching colors.
323     *
324     *     repeat until RMS doesn't improve.
325     *
326     *     if a color vector has no samples, or becomes the same as another
327     *     vector, replace it with the color which is farthest from a sample.
328     *
329     * vec[][MAX_COMP]           initial vectors and resulting colors
330     * nv                        number of resulting colors required
331     * input[N_TEXELS][MAX_COMP] input texels
332     * nc                        number of components in input / vec
333     * n                         number of input samples
334     */
335 
336    int32_t sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
337    int32_t cnt[MAX_VECT]; /* how many times a certain vector was chosen */
338    float error, lasterror = 1e9;
339 
340    int32_t i, j, k, rep;
341 
342    /* the quantizer */
343    for (rep = 0; rep < LL_N_REP; rep++) {
344       /* reset sums & counters */
345       for (j = 0; j < nv; j++) {
346          for (i = 0; i < nc; i++) {
347             sum[j][i] = 0;
348          }
349          cnt[j] = 0;
350       }
351       error = 0;
352 
353       /* scan whole block */
354       for (k = 0; k < n; k++) {
355 #if 1
356          int32_t best = -1;
357          float err = 1e9; /* big enough */
358          /* determine best vector */
359          for (j = 0; j < nv; j++) {
360             float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
361                       (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
362                       (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
363             if (nc == 4) {
364                e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
365             }
366             if (e < err) {
367                err = e;
368                best = j;
369             }
370          }
371 #else
372          int32_t best = fxt1_bestcol(vec, nv, input[k], nc, &err);
373 #endif
374          assert(best >= 0);
375          /* add in closest color */
376          for (i = 0; i < nc; i++) {
377             sum[best][i] += input[k][i];
378          }
379          /* mark this vector as used */
380          cnt[best]++;
381          /* accumulate error */
382          error += err;
383       }
384 
385       /* check RMS */
386       if ((error < LL_RMS_E) ||
387           ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
388          return !0; /* good match */
389       }
390       lasterror = error;
391 
392       /* move each vector to the barycenter of its closest colors */
393       for (j = 0; j < nv; j++) {
394          if (cnt[j]) {
395             float div = 1.0F / cnt[j];
396             for (i = 0; i < nc; i++) {
397                vec[j][i] = div * sum[j][i];
398             }
399          } else {
400             /* this vec has no samples or is identical with a previous vec */
401             int32_t worst = fxt1_worst(vec[j], input, nc, n);
402             for (i = 0; i < nc; i++) {
403                vec[j][i] = input[worst][i];
404             }
405          }
406       }
407    }
408 
409    return 0; /* could not converge fast enough */
410 }
411 
412 
413 static void
fxt1_quantize_CHROMA(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])414 fxt1_quantize_CHROMA (uint32_t *cc,
415                       uint8_t input[N_TEXELS][MAX_COMP])
416 {
417    const int32_t n_vect = 4; /* 4 base vectors to find */
418    const int32_t n_comp = 3; /* 3 components: R, G, B */
419    float vec[MAX_VECT][MAX_COMP];
420    int32_t i, j, k;
421    Fx64 hi; /* high quadword */
422    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
423 
424    if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
425       fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
426    }
427 
428    FX64_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
429    for (j = n_vect - 1; j >= 0; j--) {
430       for (i = 0; i < n_comp; i++) {
431          /* add in colors */
432          FX64_SHL(hi, 5);
433          FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
434       }
435    }
436    ((Fx64 *)cc)[1] = hi;
437 
438    lohi = lolo = 0;
439    /* right microtile */
440    for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
441       lohi <<= 2;
442       lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
443    }
444    /* left microtile */
445    for (; k >= 0; k--) {
446       lolo <<= 2;
447       lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
448    }
449    cc[1] = lohi;
450    cc[0] = lolo;
451 }
452 
453 
454 static void
fxt1_quantize_ALPHA0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)455 fxt1_quantize_ALPHA0 (uint32_t *cc,
456                       uint8_t input[N_TEXELS][MAX_COMP],
457                       uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
458 {
459    const int32_t n_vect = 3; /* 3 base vectors to find */
460    const int32_t n_comp = 4; /* 4 components: R, G, B, A */
461    float vec[MAX_VECT][MAX_COMP];
462    int32_t i, j, k;
463    Fx64 hi; /* high quadword */
464    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
465 
466    /* the last vector indicates zero */
467    for (i = 0; i < n_comp; i++) {
468       vec[n_vect][i] = 0;
469    }
470 
471    /* the first n texels in reord are guaranteed to be non-zero */
472    if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
473       fxt1_lloyd(vec, n_vect, reord, n_comp, n);
474    }
475 
476    FX64_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
477    for (j = n_vect - 1; j >= 0; j--) {
478       /* add in alphas */
479       FX64_SHL(hi, 5);
480       FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
481    }
482    for (j = n_vect - 1; j >= 0; j--) {
483       for (i = 0; i < n_comp - 1; i++) {
484          /* add in colors */
485          FX64_SHL(hi, 5);
486          FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
487       }
488    }
489    ((Fx64 *)cc)[1] = hi;
490 
491    lohi = lolo = 0;
492    /* right microtile */
493    for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
494       lohi <<= 2;
495       lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
496    }
497    /* left microtile */
498    for (; k >= 0; k--) {
499       lolo <<= 2;
500       lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
501    }
502    cc[1] = lohi;
503    cc[0] = lolo;
504 }
505 
506 
/**
 * Encode one block with three RGBA vectors where the middle vector is shared
 * by both microtiles: the left microtile interpolates between vec[0] and
 * vec[1], the right one between vec[2] and vec[1].
 *
 * \param cc     receives the encoded block as four 32-bit words: cc[0] and
 *               cc[1] hold the 2-bit indices of the left and right
 *               microtile, the upper 64 bits hold the mode bits, the three
 *               alphas and the three colours (5 bits each)
 * \param input  the N_TEXELS texels of the block
 */
static void
fxt1_quantize_ALPHA1 (uint32_t *cc,
                      uint8_t input[N_TEXELS][MAX_COMP])
{
   const int32_t n_vect = 3; /* highest vector number in each microtile */
   const int32_t n_comp = 4; /* 4 components: R, G, B, A */
   float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
   float b, iv[MAX_COMP]; /* interpolation vector */
   int32_t i, j, k;
   Fx64 hi; /* high quadword */
   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */

   int32_t minSum;
   int32_t maxSum;
   int32_t minColL = 0, maxColL = 0;
   int32_t minColR = 0, maxColR = 0;
   int32_t sumL = 0, sumR = 0;
   int32_t nn_comp;
   /* Our solution here is to find the darkest and brightest colors in
    * the 4x4 tile and use those as the two representative colors.
    * There are probably better algorithms to use (histogram-based).
    *
    * If the darkest and brightest texel coincide, the scan is repeated
    * with one component fewer (A first, then B, ...) until they differ or
    * no component is left.  sumL is not reset between repetitions, so it
    * accumulates the sums of every pass.
    */
   nn_comp = n_comp;
   while ((minColL == maxColL) && nn_comp) {
       minSum = 2000; /* big enough */
       maxSum = -1; /* small enough */
       for (k = 0; k < N_TEXELS / 2; k++) {
           int32_t sum = 0;
           for (i = 0; i < nn_comp; i++) {
               sum += input[k][i];
           }
           if (minSum > sum) {
               minSum = sum;
               minColL = k;
           }
           if (maxSum < sum) {
               maxSum = sum;
               maxColL = k;
           }
           sumL += sum;
       }

       nn_comp--;
   }

   /* same search for the right microtile; sumR accumulates like sumL */
   nn_comp = n_comp;
   while ((minColR == maxColR) && nn_comp) {
       minSum = 2000; /* big enough */
       maxSum = -1; /* small enough */
       for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
           int32_t sum = 0;
           for (i = 0; i < nn_comp; i++) {
               sum += input[k][i];
           }
           if (minSum > sum) {
               minSum = sum;
               minColR = k;
           }
           if (maxSum < sum) {
               maxSum = sum;
               maxColR = k;
           }
           sumR += sum;
       }

       nn_comp--;
   }

   /* choose the common vector (yuck!) */
   {
      int32_t j1, j2;
      int32_t v1 = 0, v2 = 0;
      float err = 1e9; /* big enough */
      float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
      for (i = 0; i < n_comp; i++) {
         tv[0][i] = input[minColL][i];
         tv[1][i] = input[maxColL][i];
         tv[2][i] = input[minColR][i];
         tv[3][i] = input[maxColR][i];
      }
      /* find the left extreme (v1) and right extreme (v2) that are
       * closest to each other
       */
      for (j1 = 0; j1 < 2; j1++) {
         for (j2 = 2; j2 < 4; j2++) {
            float e = 0.0F;
            for (i = 0; i < n_comp; i++) {
               e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
            }
            if (e < err) {
               err = e;
               v1 = j1;
               v2 = j2;
            }
         }
      }
      /* vec[0] / vec[2]: the remaining left / right extreme
       * (1 - v1 and 5 - v2 select "the other one" of each pair);
       * vec[1]: the closest pair blended, weighted by sumL and sumR.
       * NOTE(review): the blend divides by (sumL + sumR); this assumes the
       * block is not entirely zero -- confirm that the caller guarantees it.
       */
      for (i = 0; i < n_comp; i++) {
         vec[0][i] = tv[1 - v1][i];
         vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
         vec[2][i] = tv[5 - v2][i];
      }
   }

   /* left microtile */
   cc[0] = 0;
   if (minColL != maxColL) {
      /* compute interpolation vector */
      /* NOTE(review): MAKEIVEC divides by the squared distance between
       * vec[0] and vec[1]; the guard above compares texel positions, not
       * these two vectors -- verify they can never be equal here.
       */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

      /* add in texels */
      lolo = 0;
      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lolo <<= 2;
         lolo |= texel;
      }

      cc[0] = lolo;
   }

   /* right microtile */
   cc[1] = 0;
   if (minColR != maxColR) {
      /* compute interpolation vector */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);

      /* add in texels */
      lohi = 0;
      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lohi <<= 2;
         lohi |= texel;
      }

      cc[1] = lohi;
   }

   FX64_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
   for (j = n_vect - 1; j >= 0; j--) {
      /* add in alphas */
      FX64_SHL(hi, 5);
      FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
   }
   for (j = n_vect - 1; j >= 0; j--) {
      for (i = 0; i < n_comp - 1; i++) {
         /* add in colors */
         FX64_SHL(hi, 5);
         FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
      }
   }
   ((Fx64 *)cc)[1] = hi;
}
662 
663 
664 static void
fxt1_quantize_HI(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)665 fxt1_quantize_HI (uint32_t *cc,
666                   uint8_t input[N_TEXELS][MAX_COMP],
667                   uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
668 {
669    const int32_t n_vect = 6; /* highest vector number */
670    const int32_t n_comp = 3; /* 3 components: R, G, B */
671    float b = 0.0F;       /* phoudoin: silent compiler! */
672    float iv[MAX_COMP];   /* interpolation vector */
673    int32_t i, k;
674    uint32_t hihi; /* high quadword: hi dword */
675 
676    int32_t minSum = 2000; /* big enough */
677    int32_t maxSum = -1; /* small enough */
678    int32_t minCol = 0; /* phoudoin: silent compiler! */
679    int32_t maxCol = 0; /* phoudoin: silent compiler! */
680 
681    /* Our solution here is to find the darkest and brightest colors in
682     * the 8x4 tile and use those as the two representative colors.
683     * There are probably better algorithms to use (histogram-based).
684     */
685    for (k = 0; k < n; k++) {
686       int32_t sum = 0;
687       for (i = 0; i < n_comp; i++) {
688          sum += reord[k][i];
689       }
690       if (minSum > sum) {
691          minSum = sum;
692          minCol = k;
693       }
694       if (maxSum < sum) {
695          maxSum = sum;
696          maxCol = k;
697       }
698    }
699 
700    hihi = 0; /* cc-hi = "00" */
701    for (i = 0; i < n_comp; i++) {
702       /* add in colors */
703       hihi <<= 5;
704       hihi |= reord[maxCol][i] >> 3;
705    }
706    for (i = 0; i < n_comp; i++) {
707       /* add in colors */
708       hihi <<= 5;
709       hihi |= reord[minCol][i] >> 3;
710    }
711    cc[3] = hihi;
712    cc[0] = cc[1] = cc[2] = 0;
713 
714    /* compute interpolation vector */
715    if (minCol != maxCol) {
716       MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
717    }
718 
719    /* add in texels */
720    for (k = N_TEXELS - 1; k >= 0; k--) {
721       int32_t t = k * 3;
722       uint32_t *kk = (uint32_t *)((char *)cc + t / 8);
723       int32_t texel = n_vect + 1; /* transparent black */
724 
725       if (!ISTBLACK(input[k])) {
726          if (minCol != maxCol) {
727             /* interpolate color */
728             CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
729             /* add in texel */
730             kk[0] |= texel << (t & 7);
731          }
732       } else {
733          /* add in texel */
734          kk[0] |= texel << (t & 7);
735       }
736    }
737 }
738 
739 
740 static void
fxt1_quantize_MIXED1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])741 fxt1_quantize_MIXED1 (uint32_t *cc,
742                       uint8_t input[N_TEXELS][MAX_COMP])
743 {
744    const int32_t n_vect = 2; /* highest vector number in each microtile */
745    const int32_t n_comp = 3; /* 3 components: R, G, B */
746    uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
747    float b, iv[MAX_COMP]; /* interpolation vector */
748    int32_t i, j, k;
749    Fx64 hi; /* high quadword */
750    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
751 
752    int32_t minSum;
753    int32_t maxSum;
754    int32_t minColL = 0, maxColL = -1;
755    int32_t minColR = 0, maxColR = -1;
756 
757    /* Our solution here is to find the darkest and brightest colors in
758     * the 4x4 tile and use those as the two representative colors.
759     * There are probably better algorithms to use (histogram-based).
760     */
761    minSum = 2000; /* big enough */
762    maxSum = -1; /* small enough */
763    for (k = 0; k < N_TEXELS / 2; k++) {
764       if (!ISTBLACK(input[k])) {
765          int32_t sum = 0;
766          for (i = 0; i < n_comp; i++) {
767             sum += input[k][i];
768          }
769          if (minSum > sum) {
770             minSum = sum;
771             minColL = k;
772          }
773          if (maxSum < sum) {
774             maxSum = sum;
775             maxColL = k;
776          }
777       }
778    }
779    minSum = 2000; /* big enough */
780    maxSum = -1; /* small enough */
781    for (; k < N_TEXELS; k++) {
782       if (!ISTBLACK(input[k])) {
783          int32_t sum = 0;
784          for (i = 0; i < n_comp; i++) {
785             sum += input[k][i];
786          }
787          if (minSum > sum) {
788             minSum = sum;
789             minColR = k;
790          }
791          if (maxSum < sum) {
792             maxSum = sum;
793             maxColR = k;
794          }
795       }
796    }
797 
798    /* left microtile */
799    if (maxColL == -1) {
800       /* all transparent black */
801       cc[0] = ~0u;
802       for (i = 0; i < n_comp; i++) {
803          vec[0][i] = 0;
804          vec[1][i] = 0;
805       }
806    } else {
807       cc[0] = 0;
808       for (i = 0; i < n_comp; i++) {
809          vec[0][i] = input[minColL][i];
810          vec[1][i] = input[maxColL][i];
811       }
812       if (minColL != maxColL) {
813          /* compute interpolation vector */
814          MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
815 
816          /* add in texels */
817          lolo = 0;
818          for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
819             int32_t texel = n_vect + 1; /* transparent black */
820             if (!ISTBLACK(input[k])) {
821                /* interpolate color */
822                CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
823             }
824             /* add in texel */
825             lolo <<= 2;
826             lolo |= texel;
827          }
828          cc[0] = lolo;
829       }
830    }
831 
832    /* right microtile */
833    if (maxColR == -1) {
834       /* all transparent black */
835       cc[1] = ~0u;
836       for (i = 0; i < n_comp; i++) {
837          vec[2][i] = 0;
838          vec[3][i] = 0;
839       }
840    } else {
841       cc[1] = 0;
842       for (i = 0; i < n_comp; i++) {
843          vec[2][i] = input[minColR][i];
844          vec[3][i] = input[maxColR][i];
845       }
846       if (minColR != maxColR) {
847          /* compute interpolation vector */
848          MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
849 
850          /* add in texels */
851          lohi = 0;
852          for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
853             int32_t texel = n_vect + 1; /* transparent black */
854             if (!ISTBLACK(input[k])) {
855                /* interpolate color */
856                CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
857             }
858             /* add in texel */
859             lohi <<= 2;
860             lohi |= texel;
861          }
862          cc[1] = lohi;
863       }
864    }
865 
866    FX64_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
867    for (j = 2 * 2 - 1; j >= 0; j--) {
868       for (i = 0; i < n_comp; i++) {
869          /* add in colors */
870          FX64_SHL(hi, 5);
871          FX64_OR32(hi, vec[j][i] >> 3);
872       }
873    }
874    ((Fx64 *)cc)[1] = hi;
875 }
876 
877 
878 static void
fxt1_quantize_MIXED0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])879 fxt1_quantize_MIXED0 (uint32_t *cc,
880                       uint8_t input[N_TEXELS][MAX_COMP])
881 {
882    const int32_t n_vect = 3; /* highest vector number in each microtile */
883    const int32_t n_comp = 3; /* 3 components: R, G, B */
884    uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
885    float b, iv[MAX_COMP]; /* interpolation vector */
886    int32_t i, j, k;
887    Fx64 hi; /* high quadword */
888    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
889 
890    int32_t minColL = 0, maxColL = 0;
891    int32_t minColR = 0, maxColR = 0;
892 #if 0
893    int32_t minSum;
894    int32_t maxSum;
895 
896    /* Our solution here is to find the darkest and brightest colors in
897     * the 4x4 tile and use those as the two representative colors.
898     * There are probably better algorithms to use (histogram-based).
899     */
900    minSum = 2000; /* big enough */
901    maxSum = -1; /* small enough */
902    for (k = 0; k < N_TEXELS / 2; k++) {
903       int32_t sum = 0;
904       for (i = 0; i < n_comp; i++) {
905          sum += input[k][i];
906       }
907       if (minSum > sum) {
908          minSum = sum;
909          minColL = k;
910       }
911       if (maxSum < sum) {
912          maxSum = sum;
913          maxColL = k;
914       }
915    }
916    minSum = 2000; /* big enough */
917    maxSum = -1; /* small enough */
918    for (; k < N_TEXELS; k++) {
919       int32_t sum = 0;
920       for (i = 0; i < n_comp; i++) {
921          sum += input[k][i];
922       }
923       if (minSum > sum) {
924          minSum = sum;
925          minColR = k;
926       }
927       if (maxSum < sum) {
928          maxSum = sum;
929          maxColR = k;
930       }
931    }
932 #else
933    int32_t minVal;
934    int32_t maxVal;
935    int32_t maxVarL = fxt1_variance(input, n_comp);
936    int32_t maxVarR = fxt1_variance(&input[N_TEXELS / 2], n_comp);
937 
938    /* Scan the channel with max variance for lo & hi
939     * and use those as the two representative colors.
940     */
941    minVal = 2000; /* big enough */
942    maxVal = -1; /* small enough */
943    for (k = 0; k < N_TEXELS / 2; k++) {
944       int32_t t = input[k][maxVarL];
945       if (minVal > t) {
946          minVal = t;
947          minColL = k;
948       }
949       if (maxVal < t) {
950          maxVal = t;
951          maxColL = k;
952       }
953    }
954    minVal = 2000; /* big enough */
955    maxVal = -1; /* small enough */
956    for (; k < N_TEXELS; k++) {
957       int32_t t = input[k][maxVarR];
958       if (minVal > t) {
959          minVal = t;
960          minColR = k;
961       }
962       if (maxVal < t) {
963          maxVal = t;
964          maxColR = k;
965       }
966    }
967 #endif
968 
969    /* left microtile */
970    cc[0] = 0;
971    for (i = 0; i < n_comp; i++) {
972       vec[0][i] = input[minColL][i];
973       vec[1][i] = input[maxColL][i];
974    }
975    if (minColL != maxColL) {
976       /* compute interpolation vector */
977       MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
978 
979       /* add in texels */
980       lolo = 0;
981       for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
982          int32_t texel;
983          /* interpolate color */
984          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
985          /* add in texel */
986          lolo <<= 2;
987          lolo |= texel;
988       }
989 
990       /* funky encoding for LSB of green */
991       if ((int32_t)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
992          for (i = 0; i < n_comp; i++) {
993             vec[1][i] = input[minColL][i];
994             vec[0][i] = input[maxColL][i];
995          }
996          lolo = ~lolo;
997       }
998 
999       cc[0] = lolo;
1000    }
1001 
1002    /* right microtile */
1003    cc[1] = 0;
1004    for (i = 0; i < n_comp; i++) {
1005       vec[2][i] = input[minColR][i];
1006       vec[3][i] = input[maxColR][i];
1007    }
1008    if (minColR != maxColR) {
1009       /* compute interpolation vector */
1010       MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
1011 
1012       /* add in texels */
1013       lohi = 0;
1014       for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
1015          int32_t texel;
1016          /* interpolate color */
1017          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
1018          /* add in texel */
1019          lohi <<= 2;
1020          lohi |= texel;
1021       }
1022 
1023       /* funky encoding for LSB of green */
1024       if ((int32_t)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
1025          for (i = 0; i < n_comp; i++) {
1026             vec[3][i] = input[minColR][i];
1027             vec[2][i] = input[maxColR][i];
1028          }
1029          lohi = ~lohi;
1030       }
1031 
1032       cc[1] = lohi;
1033    }
1034 
1035    FX64_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
1036    for (j = 2 * 2 - 1; j >= 0; j--) {
1037       for (i = 0; i < n_comp; i++) {
1038          /* add in colors */
1039          FX64_SHL(hi, 5);
1040          FX64_OR32(hi, vec[j][i] >> 3);
1041       }
1042    }
1043    ((Fx64 *)cc)[1] = hi;
1044 }
1045 
1046 
1047 static void
fxt1_quantize(uint32_t * cc,const uint8_t * lines[],int32_t comps)1048 fxt1_quantize (uint32_t *cc, const uint8_t *lines[], int32_t comps)
1049 {
1050    int32_t trualpha;
1051    uint8_t reord[N_TEXELS][MAX_COMP];
1052 
1053    uint8_t input[N_TEXELS][MAX_COMP];
1054    int32_t i, k, l;
1055 
1056    if (comps == 3) {
1057       /* make the whole block opaque */
1058       memset(input, -1, sizeof(input));
1059    }
1060 
1061    /* 8 texels each line */
1062    for (l = 0; l < 4; l++) {
1063       for (k = 0; k < 4; k++) {
1064          for (i = 0; i < comps; i++) {
1065             input[k + l * 4][i] = *lines[l]++;
1066          }
1067       }
1068       for (; k < 8; k++) {
1069          for (i = 0; i < comps; i++) {
1070             input[k + l * 4 + 12][i] = *lines[l]++;
1071          }
1072       }
1073    }
1074 
1075    /* block layout:
1076     * 00, 01, 02, 03, 08, 09, 0a, 0b
1077     * 10, 11, 12, 13, 18, 19, 1a, 1b
1078     * 04, 05, 06, 07, 0c, 0d, 0e, 0f
1079     * 14, 15, 16, 17, 1c, 1d, 1e, 1f
1080     */
1081 
1082    /* [dBorca]
1083     * stupidity flows forth from this
1084     */
1085    l = N_TEXELS;
1086    trualpha = 0;
1087    if (comps == 4) {
1088       /* skip all transparent black texels */
1089       l = 0;
1090       for (k = 0; k < N_TEXELS; k++) {
1091          /* test all components against 0 */
1092          if (!ISTBLACK(input[k])) {
1093             /* texel is not transparent black */
1094             memcpy(reord[l], input[k], 4);
1095             if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
1096                /* non-opaque texel */
1097                trualpha = !0;
1098             }
1099             l++;
1100          }
1101       }
1102    }
1103 
1104 #if 0
1105    if (trualpha) {
1106       fxt1_quantize_ALPHA0(cc, input, reord, l);
1107    } else if (l == 0) {
1108       cc[0] = cc[1] = cc[2] = -1;
1109       cc[3] = 0;
1110    } else if (l < N_TEXELS) {
1111       fxt1_quantize_HI(cc, input, reord, l);
1112    } else {
1113       fxt1_quantize_CHROMA(cc, input);
1114    }
1115    (void)fxt1_quantize_ALPHA1;
1116    (void)fxt1_quantize_MIXED1;
1117    (void)fxt1_quantize_MIXED0;
1118 #else
1119    if (trualpha) {
1120       fxt1_quantize_ALPHA1(cc, input);
1121    } else if (l == 0) {
1122       cc[0] = cc[1] = cc[2] = ~0u;
1123       cc[3] = 0;
1124    } else if (l < N_TEXELS) {
1125       fxt1_quantize_MIXED1(cc, input);
1126    } else {
1127       fxt1_quantize_MIXED0(cc, input);
1128    }
1129    (void)fxt1_quantize_ALPHA0;
1130    (void)fxt1_quantize_HI;
1131    (void)fxt1_quantize_CHROMA;
1132 #endif
1133 }
1134 
1135 
1136 
1137 /**
1138  * Upscale an image by replication, not (typical) stretching.
1139  * We use this when the image width or height is less than a
1140  * certain size (4, 8) and we need to upscale an image.
1141  */
1142 static void
upscale_teximage2d(int32_t inWidth,int32_t inHeight,int32_t outWidth,int32_t outHeight,int32_t comps,const uint8_t * src,int32_t srcRowStride,uint8_t * dest)1143 upscale_teximage2d(int32_t inWidth, int32_t inHeight,
1144                    int32_t outWidth, int32_t outHeight,
1145                    int32_t comps, const uint8_t *src, int32_t srcRowStride,
1146                    uint8_t *dest )
1147 {
1148    int32_t i, j, k;
1149 
1150    assert(outWidth >= inWidth);
1151    assert(outHeight >= inHeight);
1152 #if 0
1153    assert(inWidth == 1 || inWidth == 2 || inHeight == 1 || inHeight == 2);
1154    assert((outWidth & 3) == 0);
1155    assert((outHeight & 3) == 0);
1156 #endif
1157 
1158    for (i = 0; i < outHeight; i++) {
1159       const int32_t ii = i % inHeight;
1160       for (j = 0; j < outWidth; j++) {
1161          const int32_t jj = j % inWidth;
1162          for (k = 0; k < comps; k++) {
1163             dest[(i * outWidth + j) * comps + k]
1164                = src[ii * srcRowStride + jj * comps + k];
1165          }
1166       }
1167    }
1168 }
1169 
1170 
/**
 * Compress a width x height image of 3- or 4-byte texels into FXT1.
 *
 * Images whose width is not a multiple of 8 or whose height is not a
 * multiple of 4 are first tiled up to the next multiple in a temporary
 * buffer.  If that allocation fails the function returns without
 * writing anything to dest.
 */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride)
{
   uint32_t *out = (uint32_t *)dest;
   void *padded = NULL;
   const uint8_t *pixels;
   uint32_t bx, by;

   assert(comps == 3 || comps == 4);

   if ((width & 7) != 0 || (height & 3) != 0) {
      /* round up to whole 8x4 blocks and replicate the image into it */
      const int32_t paddedWidth = (width + 7) & ~7;
      const int32_t paddedHeight = (height + 3) & ~3;

      padded = malloc(comps * paddedWidth * paddedHeight);
      if (padded == NULL) {
         return;
      }
      upscale_teximage2d(width, height, paddedWidth, paddedHeight,
                         comps, (const uint8_t *) source,
                         srcRowStride, (uint8_t *) padded);

      source = padded;
      srcRowStride = comps * paddedWidth;
      width = paddedWidth;
      height = paddedHeight;
   }

   pixels = (const uint8_t *) source;

   /* From here on destRowStride is the number of 32-bit words to skip
    * after each row of blocks (a row of blocks takes width * 2 bytes).
    */
   destRowStride = (destRowStride - width * 2) / 4;

   for (by = 0; by < height; by += 4) {
      uint32_t offs = by * srcRowStride;

      for (bx = 0; bx < width; bx += 8) {
         const uint8_t *lines[4];
         int32_t r;

         lines[0] = &pixels[offs];
         for (r = 1; r < 4; r++) {
            lines[r] = lines[r - 1] + srcRowStride;
         }
         offs += 8 * comps;

         fxt1_quantize(out, lines, comps);
         /* 128 bits per 8x4 block */
         out += 4;
      }
      out += destRowStride;
   }

   free(padded);
}
1219 
1220 
1221 /***************************************************************************\
1222  * FXT1 decoder
1223  *
1224  * The decoder is based on GL_3DFX_texture_compression_FXT1
1225  * specification and serves as a concept for the encoder.
1226 \***************************************************************************/
1227 
1228 
/* Lookup table for scaling 5 bit colors up to 8 bits.
 * Entry i is i * 255 / 31 rounded to the nearest integer, so 0 maps to 0
 * and 31 maps to 255.  Indexed through the UP5() macro.
 */
static const uint8_t _rgb_scale_5[] = {
   0,   8,   16,  25,  33,  41,  49,  58,
   66,  74,  82,  90,  99,  107, 115, 123,
   132, 140, 148, 156, 165, 173, 181, 189,
   197, 206, 214, 222, 230, 239, 247, 255
};

/* Lookup table for scaling 6 bit colors up to 8 bits.
 * Entry i is i * 255 / 63 rounded to the nearest integer, so 0 maps to 0
 * and 63 maps to 255.  Indexed through the UP6() macro.
 */
static const uint8_t _rgb_scale_6[] = {
   0,   4,   8,   12,  16,  20,  24,  28,
   32,  36,  40,  45,  49,  53,  57,  61,
   65,  69,  73,  77,  81,  85,  89,  93,
   97,  101, 105, 109, 113, 117, 121, 125,
   130, 134, 138, 142, 146, 150, 154, 158,
   162, 166, 170, 174, 178, 182, 186, 190,
   194, 198, 202, 206, 210, 215, 219, 223,
   227, 231, 235, 239, 243, 247, 251, 255
};
1248 
1249 
/* CC_SEL: view cc as an array of 32-bit words, pick the word that holds
 * bit number `which` and shift it right so that bit becomes bit 0.  The
 * result is NOT masked: higher bits of the word are still present, so
 * users mask it themselves (UP5/UP6 do, others use "& 1" etc.).  A field
 * that straddles two words cannot be read with this macro.
 */
#define CC_SEL(cc, which) (((uint32_t *)(cc))[(which) / 32] >> ((which) & 31))
/* UP5: expand the low 5 bits of c to an 8-bit value. */
#define UP5(c) _rgb_scale_5[(c) & 31]
/* UP6: build a 6-bit value from the low 5 bits of c (upper bits) and
 * bit 0 of b (least significant bit), then expand it to 8 bits.
 */
#define UP6(c, b) _rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)]
/* LERP: value t/n of the way from c0 to c1, rounded to nearest.
 * NOTE: the expansion is not wrapped in outer parentheses; use it only as
 * a complete expression (e.g. the right-hand side of an assignment).
 */
#define LERP(n, t, c0, c1) (((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n)
1254 
1255 
1256 static void
fxt1_decode_1HI(const uint8_t * code,int32_t t,uint8_t * rgba)1257 fxt1_decode_1HI (const uint8_t *code, int32_t t, uint8_t *rgba)
1258 {
1259    const uint32_t *cc;
1260 
1261    t *= 3;
1262    cc = (const uint32_t *)(code + t / 8);
1263    t = (cc[0] >> (t & 7)) & 7;
1264 
1265    if (t == 7) {
1266       rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
1267    } else {
1268       uint8_t r, g, b;
1269       cc = (const uint32_t *)(code + 12);
1270       if (t == 0) {
1271          b = UP5(CC_SEL(cc, 0));
1272          g = UP5(CC_SEL(cc, 5));
1273          r = UP5(CC_SEL(cc, 10));
1274       } else if (t == 6) {
1275          b = UP5(CC_SEL(cc, 15));
1276          g = UP5(CC_SEL(cc, 20));
1277          r = UP5(CC_SEL(cc, 25));
1278       } else {
1279          b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
1280          g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
1281          r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
1282       }
1283       rgba[RCOMP] = r;
1284       rgba[GCOMP] = g;
1285       rgba[BCOMP] = b;
1286       rgba[ACOMP] = 255;
1287    }
1288 }
1289 
1290 
1291 static void
fxt1_decode_1CHROMA(const uint8_t * code,int32_t t,uint8_t * rgba)1292 fxt1_decode_1CHROMA (const uint8_t *code, int32_t t, uint8_t *rgba)
1293 {
1294    const uint32_t *cc;
1295    uint32_t kk;
1296 
1297    cc = (const uint32_t *)code;
1298    if (t & 16) {
1299       cc++;
1300       t &= 15;
1301    }
1302    t = (cc[0] >> (t * 2)) & 3;
1303 
1304    t *= 15;
1305    cc = (const uint32_t *)(code + 8 + t / 8);
1306    kk = cc[0] >> (t & 7);
1307    rgba[BCOMP] = UP5(kk);
1308    rgba[GCOMP] = UP5(kk >> 5);
1309    rgba[RCOMP] = UP5(kk >> 10);
1310    rgba[ACOMP] = 255;
1311 }
1312 
1313 
1314 static void
fxt1_decode_1MIXED(const uint8_t * code,int32_t t,uint8_t * rgba)1315 fxt1_decode_1MIXED (const uint8_t *code, int32_t t, uint8_t *rgba)
1316 {
1317    const uint32_t *cc;
1318    uint32_t col[2][3];
1319    int32_t glsb, selb;
1320 
1321    cc = (const uint32_t *)code;
1322    if (t & 16) {
1323       t &= 15;
1324       t = (cc[1] >> (t * 2)) & 3;
1325       /* col 2 */
1326       col[0][BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1327       col[0][GCOMP] = CC_SEL(cc, 99);
1328       col[0][RCOMP] = CC_SEL(cc, 104);
1329       /* col 3 */
1330       col[1][BCOMP] = CC_SEL(cc, 109);
1331       col[1][GCOMP] = CC_SEL(cc, 114);
1332       col[1][RCOMP] = CC_SEL(cc, 119);
1333       glsb = CC_SEL(cc, 126);
1334       selb = CC_SEL(cc, 33);
1335    } else {
1336       t = (cc[0] >> (t * 2)) & 3;
1337       /* col 0 */
1338       col[0][BCOMP] = CC_SEL(cc, 64);
1339       col[0][GCOMP] = CC_SEL(cc, 69);
1340       col[0][RCOMP] = CC_SEL(cc, 74);
1341       /* col 1 */
1342       col[1][BCOMP] = CC_SEL(cc, 79);
1343       col[1][GCOMP] = CC_SEL(cc, 84);
1344       col[1][RCOMP] = CC_SEL(cc, 89);
1345       glsb = CC_SEL(cc, 125);
1346       selb = CC_SEL(cc, 1);
1347    }
1348 
1349    if (CC_SEL(cc, 124) & 1) {
1350       /* alpha[0] == 1 */
1351 
1352       if (t == 3) {
1353          /* zero */
1354          rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
1355       } else {
1356          uint8_t r, g, b;
1357          if (t == 0) {
1358             b = UP5(col[0][BCOMP]);
1359             g = UP5(col[0][GCOMP]);
1360             r = UP5(col[0][RCOMP]);
1361          } else if (t == 2) {
1362             b = UP5(col[1][BCOMP]);
1363             g = UP6(col[1][GCOMP], glsb);
1364             r = UP5(col[1][RCOMP]);
1365          } else {
1366             b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
1367             g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
1368             r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
1369          }
1370          rgba[RCOMP] = r;
1371          rgba[GCOMP] = g;
1372          rgba[BCOMP] = b;
1373          rgba[ACOMP] = 255;
1374       }
1375    } else {
1376       /* alpha[0] == 0 */
1377       uint8_t r, g, b;
1378       if (t == 0) {
1379          b = UP5(col[0][BCOMP]);
1380          g = UP6(col[0][GCOMP], glsb ^ selb);
1381          r = UP5(col[0][RCOMP]);
1382       } else if (t == 3) {
1383          b = UP5(col[1][BCOMP]);
1384          g = UP6(col[1][GCOMP], glsb);
1385          r = UP5(col[1][RCOMP]);
1386       } else {
1387          b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
1388          g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
1389                         UP6(col[1][GCOMP], glsb));
1390          r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
1391       }
1392       rgba[RCOMP] = r;
1393       rgba[GCOMP] = g;
1394       rgba[BCOMP] = b;
1395       rgba[ACOMP] = 255;
1396    }
1397 }
1398 
1399 
1400 static void
fxt1_decode_1ALPHA(const uint8_t * code,int32_t t,uint8_t * rgba)1401 fxt1_decode_1ALPHA (const uint8_t *code, int32_t t, uint8_t *rgba)
1402 {
1403    const uint32_t *cc;
1404    uint8_t r, g, b, a;
1405 
1406    cc = (const uint32_t *)code;
1407    if (CC_SEL(cc, 124) & 1) {
1408       /* lerp == 1 */
1409       uint32_t col0[4];
1410 
1411       if (t & 16) {
1412          t &= 15;
1413          t = (cc[1] >> (t * 2)) & 3;
1414          /* col 2 */
1415          col0[BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1416          col0[GCOMP] = CC_SEL(cc, 99);
1417          col0[RCOMP] = CC_SEL(cc, 104);
1418          col0[ACOMP] = CC_SEL(cc, 119);
1419       } else {
1420          t = (cc[0] >> (t * 2)) & 3;
1421          /* col 0 */
1422          col0[BCOMP] = CC_SEL(cc, 64);
1423          col0[GCOMP] = CC_SEL(cc, 69);
1424          col0[RCOMP] = CC_SEL(cc, 74);
1425          col0[ACOMP] = CC_SEL(cc, 109);
1426       }
1427 
1428       if (t == 0) {
1429          b = UP5(col0[BCOMP]);
1430          g = UP5(col0[GCOMP]);
1431          r = UP5(col0[RCOMP]);
1432          a = UP5(col0[ACOMP]);
1433       } else if (t == 3) {
1434          b = UP5(CC_SEL(cc, 79));
1435          g = UP5(CC_SEL(cc, 84));
1436          r = UP5(CC_SEL(cc, 89));
1437          a = UP5(CC_SEL(cc, 114));
1438       } else {
1439          b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
1440          g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
1441          r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
1442          a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
1443       }
1444    } else {
1445       /* lerp == 0 */
1446 
1447       if (t & 16) {
1448          cc++;
1449          t &= 15;
1450       }
1451       t = (cc[0] >> (t * 2)) & 3;
1452 
1453       if (t == 3) {
1454          /* zero */
1455          r = g = b = a = 0;
1456       } else {
1457          uint32_t kk;
1458          cc = (const uint32_t *)code;
1459          a = UP5(cc[3] >> (t * 5 + 13));
1460          t *= 15;
1461          cc = (const uint32_t *)(code + 8 + t / 8);
1462          kk = cc[0] >> (t & 7);
1463          b = UP5(kk);
1464          g = UP5(kk >> 5);
1465          r = UP5(kk >> 10);
1466       }
1467    }
1468    rgba[RCOMP] = r;
1469    rgba[GCOMP] = g;
1470    rgba[BCOMP] = b;
1471    rgba[ACOMP] = a;
1472 }
1473 
1474 
/**
 * Decode the texel at column i, row j of an FXT1 image.
 *
 * Locates the 16-byte block that contains the texel (blocks are 8x4
 * texels, stride / 8 blocks per row), reads the 3-bit mode stored in the
 * top bits of the block and hands over to the matching decoder.
 */
static void
fxt1_decode_1 (const void *texture, int32_t stride, /* in pixels */
               int32_t i, int32_t j, uint8_t *rgba)
{
   const int32_t block = (j / 4) * (stride / 8) + (i / 8);
   const uint8_t *code = (const uint8_t *)texture + block * 16;
   const int32_t mode = CC_SEL(code, 125);
   /* texel number inside the block: 4 per row in each 4x4 half, the
    * right half (columns 4..7) starting at 16
    */
   int32_t t = (i & 7) + (j & 3) * 4;

   if (i & 4) {
      t += 12;
   }

   switch (mode) {
   case 0:
   case 1:
      /* cc-high   = "00?" */
      fxt1_decode_1HI(code, t, rgba);
      break;
   case 2:
      /* cc-chroma = "010" */
      fxt1_decode_1CHROMA(code, t, rgba);
      break;
   case 3:
      /* alpha     = "011" */
      fxt1_decode_1ALPHA(code, t, rgba);
      break;
   default:
      /* mixed     = "1??" */
      fxt1_decode_1MIXED(code, t, rgba);
      break;
   }
}
1502 
1503 /*
1504  * Pixel fetch within a block.
1505  */
1506 
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as 8-bit RGBA.
 *
 * The block decoder can return alpha 0 for some selectors.  The RGB
 * format has no alpha, so it is forced to 0xff here, matching the other
 * RGB entry points in this file (the float fetch writes 1.0 and the
 * block unpack writes 0xff).
 */
void
util_format_fxt1_rgb_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
   dst[3] = 0xff;
}
1512 
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as 8-bit RGBA.
 *
 * The decoded alpha is kept as is, matching the RGBA float fetch and the
 * RGBA block unpack in this file; overwriting it with 0xff would discard
 * the alpha channel of the format.
 */
void
util_format_fxt1_rgba_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
}
1519 
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as four floats.
 * Alpha is always written as 1.0.
 */
void
util_format_fxt1_rgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   float *out = in_dst;
   uint8_t texel[4];
   unsigned c;

   fxt1_decode_1(src, 0, i, j, texel);
   for (c = 0; c < 3; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
   out[3] = 1.0;
}
1531 
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as four floats,
 * including the decoded alpha.
 */
void
util_format_fxt1_rgba_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   float *out = in_dst;
   uint8_t texel[4];
   unsigned c;

   fxt1_decode_1(src, 0, i, j, texel);
   for (c = 0; c < 4; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
}
1543 
1544 /*
1545  * Block decompression.
1546  */
1547 
1548 static inline void
util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t * restrict dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,boolean rgba)1549 util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1550                                         const uint8_t *restrict src_row, unsigned src_stride,
1551                                         unsigned width, unsigned height,
1552                                         boolean rgba)
1553 {
1554    const unsigned bw = 8, bh = 4, comps = 4;
1555    unsigned x, y, i, j;
1556    for (y = 0; y < height; y += bh) {
1557       const uint8_t *src = src_row;
1558       for (x = 0; x < width; x += bw) {
1559          for (j = 0; j < bh; ++j) {
1560             for (i = 0; i < bw; ++i) {
1561                uint8_t *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + (x + i) * comps;
1562                fxt1_decode_1(src, 0, i, j, dst);
1563                if (!rgba)
1564                   dst[3] = 0xff;
1565             }
1566          }
1567          src += FXT1_BLOCK_SIZE;
1568       }
1569       src_row += src_stride;
1570    }
1571 }
1572 
/**
 * Decompress FXT1 RGB blocks into 8-bit RGBA; alpha is forced opaque by
 * the shared helper (rgba == false).
 */
void
util_format_fxt1_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, false);
}
1583 
/**
 * Decompress FXT1 RGBA blocks into 8-bit RGBA, keeping the decoded alpha
 * (rgba == true in the shared helper).
 */
void
util_format_fxt1_rgba_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                         const uint8_t *restrict src_row, unsigned src_stride,
                                         unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, true);
}
1594 
1595 static inline void
util_format_fxtn_rgb_unpack_rgba_float(float * dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,boolean rgba)1596 util_format_fxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
1597                                        const uint8_t *restrict src_row, unsigned src_stride,
1598                                        unsigned width, unsigned height,
1599                                        boolean rgba)
1600 {
1601    const unsigned bw = 8, bh = 4, comps = 4;
1602    unsigned x, y, i, j;
1603    for (y = 0; y < height; y += 4) {
1604       const uint8_t *src = src_row;
1605       for (x = 0; x < width; x += 8) {
1606          for (j = 0; j < bh; ++j) {
1607             for (i = 0; i < bw; ++i) {
1608                float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i) * comps;
1609                uint8_t tmp[4];
1610                fxt1_decode_1(src, 0, i, j, tmp);
1611                dst[0] = ubyte_to_float(tmp[0]);
1612                dst[1] = ubyte_to_float(tmp[1]);
1613                dst[2] = ubyte_to_float(tmp[2]);
1614                if (rgba)
1615                   dst[3] = ubyte_to_float(tmp[3]);
1616                else
1617                   dst[3] = 1.0;
1618             }
1619          }
1620          src += FXT1_BLOCK_SIZE;
1621       }
1622       src_row += src_stride;
1623    }
1624 }
1625 
/**
 * Decompress FXT1 RGB blocks into float RGBA; alpha is written as 1.0 by
 * the shared helper (rgba == false).
 */
void
util_format_fxt1_rgb_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src_row, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, false);
}
1636 
/**
 * Decompress FXT1 RGBA blocks into float RGBA, keeping the decoded alpha
 * (rgba == true in the shared helper).
 */
void
util_format_fxt1_rgba_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, true);
}
1647 
1648 /*
1649  * Block compression.
1650  */
1651 
/**
 * Compress 8-bit RGBA texels into FXT1 RGB.
 *
 * The encoder takes tightly packed 24bpp RGB for this format, so the
 * source is first copied into a temporary buffer without its alpha
 * bytes.  Returns without writing to dst_row if that buffer cannot be
 * allocated.
 */
void
util_format_fxt1_rgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const uint8_t *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int rgb_stride = width * 3;
   uint8_t *rgb = malloc(height * rgb_stride);
   const uint8_t *in_row = src;
   uint8_t *out;

   if (rgb == NULL) {
      return;
   }

   /* rgb rows are contiguous, so one running output pointer suffices */
   out = rgb;
   for (int y = 0; y < height; y++) {
      const uint8_t *in = in_row;
      for (int x = 0; x < width; x++) {
         *out++ = in[0];
         *out++ = in[1];
         *out++ = in[2];
         in += 4;
      }
      in_row += src_stride;
   }

   fxt1_encode(width, height, 3, rgb, rgb_stride, dst_row, dst_stride);

   free(rgb);
}
1677 
/**
 * Compress 8-bit RGBA texels into FXT1 RGBA.  The source already has the
 * 4-bytes-per-texel layout the encoder takes, so it is passed through.
 */
void
util_format_fxt1_rgba_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   const int32_t comps = 4;

   fxt1_encode(width, height, comps, src, src_stride, dst_row, dst_stride);
}
1685 
/**
 * Compress float RGBA texels into FXT1 RGB.
 *
 * The floats are first converted to a temporary 8-bit RGBA image, which
 * is then handed to the 8-bit RGB packer.  Returns without writing to
 * dst_row if the temporary cannot be allocated.
 */
void
util_format_fxt1_rgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                     const float *restrict src, unsigned src_stride,
                                     unsigned width, unsigned height)
{
   const int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8 != NULL) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);
      util_format_fxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride,
                                            rgba8, rgba8_stride,
                                            width, height);
      free(rgba8);
   }
}
1706 
/**
 * Compress float RGBA texels into FXT1 RGBA.
 *
 * The floats are first converted to a temporary 8-bit RGBA image, which
 * is then handed to the 8-bit RGBA packer.  Returns without writing to
 * dst_row if the temporary cannot be allocated.
 */
void
util_format_fxt1_rgba_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const float *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8 != NULL) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);
      util_format_fxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride,
                                             rgba8, rgba8_stride,
                                             width, height);
      free(rgba8);
   }
}
1727