1 /*
2 Copyright 2007 nVidia, Inc.
3 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
4 
5 You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
6 
7 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
8 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 
10 See the License for the specific language governing permissions and limitations under the License.
11 */
12 
13 // one region zoh compress/decompress code
14 // Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
15 
16 #include "bits.h"
17 #include "tile.h"
18 #include "zoh.h"
19 #include "zoh_utils.h"
20 
21 #include "nvmath/vector.inl"
22 #include "nvmath/fitting.h"
23 
24 #include <string.h> // strlen
25 #include <float.h> // FLT_MAX
26 
27 using namespace nv;
28 using namespace ZOH;
29 
30 #define NINDICES	16
31 #define	INDEXBITS	4
32 #define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
33 #define	DENOM		(NINDICES-1)
34 
35 #define	NSHAPES	1
36 
37 static const int shapes[NSHAPES] =
38 {
39     0x0000
40 };	// only 1 shape
41 
42 #define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
43 
44 #define	POS_TO_X(pos)	((pos)&3)
45 #define	POS_TO_Y(pos)	(((pos)>>2)&3)
46 
47 #define	NDELTA	2
48 
49 struct Chanpat
50 {
51     int prec[NDELTA];		// precision pattern for one channel
52 };
53 
54 struct Pattern
55 {
56     Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
57     int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
58     int mode;				// associated mode value
59     int modebits;			// number of mode bits
60     const char *encoding;	// verilog description of encoding for this mode
61 };
62 
63 #define MAXMODEBITS	5
64 #define	MAXMODES (1<<MAXMODEBITS)
65 
66 #define	NPATTERNS 4
67 
68 static const Pattern patterns[NPATTERNS] =
69 {
70     16,4,  16,4,  16,4,   1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
71     12,8,  12,8,  12,8,   1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
72     11,9,  11,9,  11,9,   1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
73     10,10, 10,10, 10,10,  0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
74 };
75 
76 // mapping of mode to the corresponding index in pattern
77 static const int mode_to_pat[MAXMODES] = {
78     -1,-1,-1,
79     3,	// 0x03
80     -1,-1,-1,
81     2,	// 0x07
82     -1,-1,-1,
83     1,	// 0x0b
84     -1,-1,-1,
85     0,	// 0x0f
86     -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
87 };
88 
89 #define	R_0(ep)	(ep)[0].A[i]
90 #define	R_1(ep)	(ep)[0].B[i]
91 #define	MASK(n)	((1<<(n))-1)
92 
93 // compress endpoints
compress_endpts(const IntEndpts in[NREGIONS_ONE],ComprEndpts out[NREGIONS_ONE],const Pattern & p)94 static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
95 {
96     if (p.transformed)
97     {
98         for (int i=0; i<NCHANNELS; ++i)
99         {
100             R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
101             R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
102         }
103     }
104     else
105     {
106         for (int i=0; i<NCHANNELS; ++i)
107         {
108             R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
109             R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
110         }
111     }
112 }
113 
114 // decompress endpoints
decompress_endpts(const ComprEndpts in[NREGIONS_ONE],IntEndpts out[NREGIONS_ONE],const Pattern & p)115 static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
116 {
117     bool issigned = Utils::FORMAT == SIGNED_F16;
118 
119     if (p.transformed)
120     {
121         for (int i=0; i<NCHANNELS; ++i)
122         {
123             R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
124             int t;
125             t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
126             t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
127             R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
128         }
129     }
130     else
131     {
132         for (int i=0; i<NCHANNELS; ++i)
133         {
134             R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
135             R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
136         }
137     }
138 }
139 
quantize_endpts(const FltEndpts endpts[NREGIONS_ONE],int prec,IntEndpts q_endpts[NREGIONS_ONE])140 static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
141 {
142     for (int region = 0; region < NREGIONS_ONE; ++region)
143     {
144         q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
145         q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
146         q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
147         q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
148         q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
149         q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
150     }
151 }
152 
153 // swap endpoints as needed to ensure that the indices at index_one and index_one have a 0 high-order bit
154 // index_one is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
swap_indices(IntEndpts endpts[NREGIONS_ONE],int indices[Tile::TILE_H][Tile::TILE_W],int shapeindex)155 static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
156 {
157     int index_positions[NREGIONS_ONE];
158 
159     index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
160 
161     for (int region = 0; region < NREGIONS_ONE; ++region)
162     {
163         int x = index_positions[region] & 3;
164         int y = (index_positions[region] >> 2) & 3;
165         nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
166         if (indices[y][x] & HIGH_INDEXBIT)
167         {
168             // high bit is set, swap the endpts and indices for this region
169             int t;
170             for (int i=0; i<NCHANNELS; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
171 
172             for (int y = 0; y < Tile::TILE_H; y++)
173                 for (int x = 0; x < Tile::TILE_W; x++)
174                     if (REGION(x,y,shapeindex) == region)
175                         indices[y][x] = NINDICES - 1 - indices[y][x];
176         }
177     }
178 }
179 
180 // endpoints fit only if the compression was lossless
endpts_fit(const IntEndpts orig[NREGIONS_ONE],const ComprEndpts compressed[NREGIONS_ONE],const Pattern & p)181 static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
182 {
183     IntEndpts uncompressed[NREGIONS_ONE];
184 
185     decompress_endpts(compressed, uncompressed, p);
186 
187     for (int j=0; j<NREGIONS_ONE; ++j)
188 	for (int i=0; i<NCHANNELS; ++i)
189 	{
190         if (orig[j].A[i] != uncompressed[j].A[i]) return false;
191         if (orig[j].B[i] != uncompressed[j].B[i]) return false;
192     }
193     return true;
194 }
195 
write_header(const ComprEndpts endpts[NREGIONS_ONE],const Pattern & p,Bits & out)196 static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
197 {
198     // interpret the verilog backwards and process it
199     int m = p.mode;
200     int rw = endpts[0].A[0], rx = endpts[0].B[0];
201     int gw = endpts[0].A[1], gx = endpts[0].B[1];
202     int bw = endpts[0].A[2], bx = endpts[0].B[2];
203     int ptr = int(strlen(p.encoding));
204     while (ptr)
205     {
206         Field field;
207         int endbit, len;
208 
209 		// !!!UNDONE: get rid of string parsing!!!
210         Utils::parse(p.encoding, ptr, field, endbit, len);
211         switch(field)
212         {
213         case FIELD_M:	out.write( m >> endbit, len); break;
214         case FIELD_RW:	out.write(rw >> endbit, len); break;
215         case FIELD_RX:	out.write(rx >> endbit, len); break;
216         case FIELD_GW:	out.write(gw >> endbit, len); break;
217         case FIELD_GX:	out.write(gx >> endbit, len); break;
218         case FIELD_BW:	out.write(bw >> endbit, len); break;
219         case FIELD_BX:	out.write(bx >> endbit, len); break;
220 
221         case FIELD_D:
222         case FIELD_RY:
223         case FIELD_RZ:
224         case FIELD_GY:
225         case FIELD_GZ:
226         case FIELD_BY:
227         case FIELD_BZ:
228         default: nvUnreachable();
229         }
230     }
231 }
232 
read_header(Bits & in,ComprEndpts endpts[NREGIONS_ONE],Pattern & p)233 static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
234 {
235     // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
236     int mode = in.read(2);
237     if (mode != 0x00 && mode != 0x01)
238         mode = (in.read(3) << 2) | mode;
239 
240     int pat_index = mode_to_pat[mode];
241 
242     nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
243     nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
244 
245     p = patterns[pat_index];
246 
247     int d;
248     int rw, rx;
249     int gw, gx;
250     int bw, bx;
251 
252     d = 0;
253     rw = rx = 0;
254     gw = gx = 0;
255     bw = bx = 0;
256 
257     int ptr = int(strlen(p.encoding));
258 
259     while (ptr)
260     {
261         Field field;
262         int endbit, len;
263 
264 		// !!!UNDONE: get rid of string parsing!!!
265         Utils::parse(p.encoding, ptr, field, endbit, len);
266 
267         switch(field)
268         {
269         case FIELD_M:	break;	// already processed so ignore
270         case FIELD_RW:	rw |= in.read(len) << endbit; break;
271         case FIELD_RX:	rx |= in.read(len) << endbit; break;
272         case FIELD_GW:	gw |= in.read(len) << endbit; break;
273         case FIELD_GX:	gx |= in.read(len) << endbit; break;
274         case FIELD_BW:	bw |= in.read(len) << endbit; break;
275         case FIELD_BX:	bx |= in.read(len) << endbit; break;
276 
277         case FIELD_D:
278         case FIELD_RY:
279         case FIELD_RZ:
280         case FIELD_GY:
281         case FIELD_GZ:
282         case FIELD_BY:
283         case FIELD_BZ:
284         default: nvUnreachable();
285         }
286     }
287 
288     nvDebugCheck (in.getptr() == 128 - 63);
289 
290     endpts[0].A[0] = rw; endpts[0].B[0] = rx;
291     endpts[0].A[1] = gw; endpts[0].B[1] = gx;
292     endpts[0].A[2] = bw; endpts[0].B[2] = bx;
293 }
294 
295 // compress index 0
write_indices(const int indices[Tile::TILE_H][Tile::TILE_W],int shapeindex,Bits & out)296 static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
297 {
298     for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
299     {
300         int x = POS_TO_X(pos);
301         int y = POS_TO_Y(pos);
302 
303         out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0));
304     }
305 }
306 
emit_block(const ComprEndpts endpts[NREGIONS_ONE],int shapeindex,const Pattern & p,const int indices[Tile::TILE_H][Tile::TILE_W],char * block)307 static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
308 {
309     Bits out(block, ZOH::BITSIZE);
310 
311     write_header(endpts, p, out);
312 
313     write_indices(indices, shapeindex, out);
314 
315     nvDebugCheck(out.getptr() == ZOH::BITSIZE);
316 }
317 
generate_palette_quantized(const IntEndpts & endpts,int prec,Vector3 palette[NINDICES])318 static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
319 {
320     // scale endpoints
321     int a, b;			// really need a IntVector3...
322 
323     a = Utils::unquantize(endpts.A[0], prec);
324     b = Utils::unquantize(endpts.B[0], prec);
325 
326     // interpolate
327     for (int i = 0; i < NINDICES; ++i)
328         palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
329 
330     a = Utils::unquantize(endpts.A[1], prec);
331     b = Utils::unquantize(endpts.B[1], prec);
332 
333     // interpolate
334     for (int i = 0; i < NINDICES; ++i)
335         palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
336 
337     a = Utils::unquantize(endpts.A[2], prec);
338     b = Utils::unquantize(endpts.B[2], prec);
339 
340     // interpolate
341     for (int i = 0; i < NINDICES; ++i)
342         palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
343 }
344 
345 // position 0 was compressed
read_indices(Bits & in,int shapeindex,int indices[Tile::TILE_H][Tile::TILE_W])346 static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
347 {
348     for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
349     {
350         int x = POS_TO_X(pos);
351         int y = POS_TO_Y(pos);
352 
353         indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 1 : 0));
354     }
355 }
356 
decompressone(const char * block,Tile & t)357 void ZOH::decompressone(const char *block, Tile &t)
358 {
359     Bits in(block, ZOH::BITSIZE);
360 
361     Pattern p;
362     IntEndpts endpts[NREGIONS_ONE];
363     ComprEndpts compr_endpts[NREGIONS_ONE];
364 
365     read_header(in, compr_endpts, p);
366     int shapeindex = 0;		// only one shape
367 
368     decompress_endpts(compr_endpts, endpts, p);
369 
370     Vector3 palette[NREGIONS_ONE][NINDICES];
371     for (int r = 0; r < NREGIONS_ONE; ++r)
372         generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
373 
374     // read indices
375     int indices[Tile::TILE_H][Tile::TILE_W];
376 
377     read_indices(in, shapeindex, indices);
378 
379     nvDebugCheck(in.getptr() == ZOH::BITSIZE);
380 
381     // lookup
382     for (int y = 0; y < Tile::TILE_H; y++)
383 	for (int x = 0; x < Tile::TILE_W; x++)
384             t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
385 }
386 
387 // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
map_colors(const Vector3 colors[],const float importance[],int np,const IntEndpts & endpts,int prec)388 static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
389 {
390     Vector3 palette[NINDICES];
391     float toterr = 0;
392     Vector3 err;
393 
394     generate_palette_quantized(endpts, prec, palette);
395 
396     for (int i = 0; i < np; ++i)
397     {
398         float err, besterr;
399 
400         besterr = Utils::norm(colors[i], palette[0]) * importance[i];
401 
402         for (int j = 1; j < NINDICES && besterr > 0; ++j)
403         {
404             err = Utils::norm(colors[i], palette[j]) * importance[i];
405 
406             if (err > besterr)	// error increased, so we're done searching
407                 break;
408             if (err < besterr)
409                 besterr = err;
410         }
411         toterr += besterr;
412     }
413     return toterr;
414 }
415 
416 // assign indices given a tile, shape, and quantized endpoints, return toterr for each region
assign_indices(const Tile & tile,int shapeindex,IntEndpts endpts[NREGIONS_ONE],int prec,int indices[Tile::TILE_H][Tile::TILE_W],float toterr[NREGIONS_ONE])417 static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec,
418                            int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
419 {
420     // build list of possibles
421     Vector3 palette[NREGIONS_ONE][NINDICES];
422 
423     for (int region = 0; region < NREGIONS_ONE; ++region)
424     {
425         generate_palette_quantized(endpts[region], prec, &palette[region][0]);
426         toterr[region] = 0;
427     }
428 
429     Vector3 err;
430 
431     for (int y = 0; y < tile.size_y; y++)
432 	for (int x = 0; x < tile.size_x; x++)
433 	{
434         int region = REGION(x,y,shapeindex);
435         float err, besterr;
436 
437         besterr = Utils::norm(tile.data[y][x], palette[region][0]);
438         indices[y][x] = 0;
439 
440         for (int i = 1; i < NINDICES && besterr > 0; ++i)
441         {
442             err = Utils::norm(tile.data[y][x], palette[region][i]);
443 
444             if (err > besterr)	// error increased, so we're done searching
445                 break;
446             if (err < besterr)
447             {
448                 besterr = err;
449                 indices[y][x] = i;
450             }
451         }
452         toterr[region] += besterr;
453     }
454 }
455 
perturb_one(const Vector3 colors[],const float importance[],int np,int ch,int prec,const IntEndpts & old_endpts,IntEndpts & new_endpts,float old_err,int do_b)456 static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
457                           float old_err, int do_b)
458 {
459     // we have the old endpoints: old_endpts
460     // we have the perturbed endpoints: new_endpts
461     // we have the temporary endpoints: temp_endpts
462 
463     IntEndpts temp_endpts;
464     float min_err = old_err;		// start with the best current error
465     int beststep;
466 
467     // copy real endpoints so we can perturb them
468     for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
469 
470     // do a logarithmic search for the best error for this endpoint (which)
471     for (int step = 1 << (prec-1); step; step >>= 1)
472     {
473         bool improved = false;
474         for (int sign = -1; sign <= 1; sign += 2)
475         {
476             if (do_b == 0)
477             {
478                 temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
479                 if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
480                     continue;
481             }
482             else
483             {
484                 temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
485                 if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
486                     continue;
487             }
488 
489             float err = map_colors(colors, importance, np, temp_endpts, prec);
490 
491             if (err < min_err)
492             {
493                 improved = true;
494                 min_err = err;
495                 beststep = sign * step;
496             }
497         }
498         // if this was an improvement, move the endpoint and continue search from there
499         if (improved)
500         {
501             if (do_b == 0)
502                 new_endpts.A[ch] += beststep;
503             else
504                 new_endpts.B[ch] += beststep;
505         }
506     }
507     return min_err;
508 }
509 
optimize_one(const Vector3 colors[],const float importance[],int np,float orig_err,const IntEndpts & orig_endpts,int prec,IntEndpts & opt_endpts)510 static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
511 {
512     float opt_err = orig_err;
513     for (int ch = 0; ch < NCHANNELS; ++ch)
514     {
515         opt_endpts.A[ch] = orig_endpts.A[ch];
516         opt_endpts.B[ch] = orig_endpts.B[ch];
517     }
518     /*
519         err0 = perturb(rgb0, delta0)
520         err1 = perturb(rgb1, delta1)
521         if (err0 < err1)
522             if (err0 >= initial_error) break
523             rgb0 += delta0
524             next = 1
525         else
526             if (err1 >= initial_error) break
527             rgb1 += delta1
528             next = 0
529         initial_err = map()
530         for (;;)
531             err = perturb(next ? rgb1:rgb0, delta)
532             if (err >= initial_err) break
533             next? rgb1 : rgb0 += delta
534             initial_err = err
535 	*/
536     IntEndpts new_a, new_b;
537     IntEndpts new_endpt;
538     int do_b;
539 
540     // now optimize each channel separately
541     for (int ch = 0; ch < NCHANNELS; ++ch)
542     {
543         // figure out which endpoint when perturbed gives the most improvement and start there
544         // if we just alternate, we can easily end up in a local minima
545         float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
546         float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
547 
548         if (err0 < err1)
549         {
550             if (err0 >= opt_err)
551                 continue;
552 
553             opt_endpts.A[ch] = new_a.A[ch];
554             opt_err = err0;
555             do_b = 1;		// do B next
556         }
557         else
558         {
559             if (err1 >= opt_err)
560                 continue;
561             opt_endpts.B[ch] = new_b.B[ch];
562             opt_err = err1;
563             do_b = 0;		// do A next
564         }
565 
566         // now alternate endpoints and keep trying until there is no improvement
567         for (;;)
568         {
569             float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
570             if (err >= opt_err)
571                 break;
572             if (do_b == 0)
573                 opt_endpts.A[ch] = new_endpt.A[ch];
574             else
575                 opt_endpts.B[ch] = new_endpt.B[ch];
576             opt_err = err;
577             do_b = 1 - do_b;	// now move the other endpoint
578         }
579     }
580 }
581 
optimize_endpts(const Tile & tile,int shapeindex,const float orig_err[NREGIONS_ONE],const IntEndpts orig_endpts[NREGIONS_ONE],int prec,IntEndpts opt_endpts[NREGIONS_ONE])582 static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE],
583                             const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
584 {
585     Vector3 pixels[Tile::TILE_TOTAL];
586     float importance[Tile::TILE_TOTAL];
587     float err = 0;
588 
589     for (int region=0; region<NREGIONS_ONE; ++region)
590     {
591         // collect the pixels in the region
592         int np = 0;
593 
594         for (int y = 0; y < tile.size_y; y++) {
595             for (int x = 0; x < tile.size_x; x++) {
596                 if (REGION(x, y, shapeindex) == region) {
597                     pixels[np] = tile.data[y][x];
598                     importance[np] = tile.importance_map[y][x];
599                     ++np;
600                 }
601             }
602         }
603 
604         optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
605     }
606 }
607 
608 /* optimization algorithm
609     for each pattern
610         convert endpoints using pattern precision
611         assign indices and get initial error
612         compress indices (and possibly reorder endpoints)
613         transform endpoints
614         if transformed endpoints fit pattern
615             get original endpoints back
616             optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
617             compress new indices
618             transform new endpoints
619             if new endpoints fit pattern AND if error is improved
620                 emit compressed block with new data
621             else
622                 emit compressed block with original data // to try to preserve maximum endpoint precision
623 */
624 
refineone(const Tile & tile,int shapeindex_best,const FltEndpts endpts[NREGIONS_ONE],char * block)625 float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
626 {
627     float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
628     IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
629     ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
630     int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
631 
632     for (int sp = 0; sp < NPATTERNS; ++sp)
633     {
634         // precisions for all channels need to be the same
635         for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
636 
637         quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
638         assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
639         swap_indices(orig_endpts, orig_indices, shapeindex_best);
640         compress_endpts(orig_endpts, compr_orig, patterns[sp]);
641         if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
642         {
643             optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
644             assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
645             swap_indices(opt_endpts, opt_indices, shapeindex_best);
646             compress_endpts(opt_endpts, compr_opt, patterns[sp]);
647             orig_toterr = opt_toterr = 0;
648             for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
649 
650             if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
651             {
652                 emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
653                 return opt_toterr;
654             }
655             else
656             {
657                 // either it stopped fitting when we optimized it, or there was no improvement
658                 // so go back to the unoptimized endpoints which we know will fit
659                 emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
660                 return orig_toterr;
661             }
662         }
663     }
664 
665 	nvAssert (false); // "No candidate found, should never happen (refineone.)";
666 	return FLT_MAX;
667 }
668 
generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE],Vector3 palette[NREGIONS_ONE][NINDICES])669 static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
670 {
671     for (int region = 0; region < NREGIONS_ONE; ++region)
672 	for (int i = 0; i < NINDICES; ++i)
673             palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
674 }
675 
676 // generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
map_colors(const Tile & tile,int shapeindex,const FltEndpts endpts[NREGIONS_ONE])677 static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
678 {
679     // build list of possibles
680     Vector3 palette[NREGIONS_ONE][NINDICES];
681 
682     generate_palette_unquantized(endpts, palette);
683 
684     float toterr = 0;
685     Vector3 err;
686 
687     for (int y = 0; y < tile.size_y; y++)
688 	for (int x = 0; x < tile.size_x; x++)
689 	{
690         int region = REGION(x,y,shapeindex);
691         float err, besterr;
692 
693         besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
694 
695         for (int i = 1; i < NINDICES && besterr > 0; ++i)
696         {
697             err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
698 
699             if (err > besterr)	// error increased, so we're done searching
700                 break;
701             if (err < besterr)
702                 besterr = err;
703         }
704         toterr += besterr;
705     }
706     return toterr;
707 }
708 
roughone(const Tile & tile,int shapeindex,FltEndpts endpts[NREGIONS_ONE])709 float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
710 {
711     for (int region=0; region<NREGIONS_ONE; ++region)
712     {
713         int np = 0;
714         Vector3 colors[Tile::TILE_TOTAL];
715         Vector3 mean(0,0,0);
716 
717         for (int y = 0; y < tile.size_y; y++) {
718             for (int x = 0; x < tile.size_x; x++) {
719                 if (REGION(x,y,shapeindex) == region)
720                 {
721                     colors[np] = tile.data[y][x];
722                     mean += tile.data[y][x];
723                     ++np;
724                 }
725             }
726         }
727 
728         // handle simple cases
729         if (np == 0)
730         {
731             Vector3 zero(0,0,0);
732             endpts[region].A = zero;
733             endpts[region].B = zero;
734             continue;
735         }
736         else if (np == 1)
737         {
738             endpts[region].A = colors[0];
739             endpts[region].B = colors[0];
740             continue;
741         }
742         else if (np == 2)
743         {
744             endpts[region].A = colors[0];
745             endpts[region].B = colors[1];
746             continue;
747         }
748 
749         mean /= float(np);
750 
751         Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
752 
753         // project each pixel value along the principal direction
754         float minp = FLT_MAX, maxp = -FLT_MAX;
755         for (int i = 0; i < np; i++)
756         {
757             float dp = dot(colors[i]-mean, direction);
758             if (dp < minp) minp = dp;
759             if (dp > maxp) maxp = dp;
760         }
761 
762         // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
763         endpts[region].A = mean + minp*direction;
764         endpts[region].B = mean + maxp*direction;
765 
766         // clamp endpoints
767         // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
768         // shape based on endpoints being clamped
769         Utils::clamp(endpts[region].A);
770         Utils::clamp(endpts[region].B);
771     }
772 
773     return map_colors(tile, shapeindex, endpts);
774 }
775 
compressone(const Tile & t,char * block)776 float ZOH::compressone(const Tile &t, char *block)
777 {
778     int shapeindex_best = 0;
779     FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
780     float msebest = FLT_MAX;
781 
782     /*
783 		collect the mse values that are within 5% of the best values
784 		optimize each one and choose the best
785 	*/
786     // hack for now -- just use the best value WORK
787     for (int i=0; i<NSHAPES && msebest>0.0; ++i)
788     {
789         float mse = roughone(t, i, tempendpts);
790         if (mse < msebest)
791         {
792             msebest = mse;
793             shapeindex_best = i;
794             memcpy(endptsbest, tempendpts, sizeof(endptsbest));
795         }
796 
797     }
798     return refineone(t, shapeindex_best, endptsbest, block);
799 }
800