/*
Copyright 2007 nVidia, Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.

You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and limitations under the License.
*/

// two-region ZOH compress/decompress code
// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis

/* optimization algorithm

	get initial float endpoints
	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
	for each EC candidate in order from max precision to smaller precision
		convert endpoints using the appropriate precision.
		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
			(thus the endpoints and indices are in final form.)
		transform and get bit delta.
		if the bit delta fits, exit
	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should hardly ever happen.
		add a state variable and an nvDebugCheck to ensure we only do this once.
	convert to bit stream.
	return the error.

	Global optimization
		order all tiles based on their errors
		do something special for high-error tiles
			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...

	display an image that shows the partitioning and precision selected for each tile
*/

#include "bits.h"
#include "tile.h"
#include "zoh.h"
#include "zoh_utils.h"

#include "nvmath/fitting.h"
#include "nvmath/vector.inl"

#include <string.h> // strlen
#include <float.h> // FLT_MAX

using namespace nv;
using namespace ZOH;

#define NINDICES	8
#define	INDEXBITS	3
#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
#define	DENOM		(NINDICES-1)
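// each texel gets a 3-bit index into an 8-entry palette interpolated between the two
// region endpoints; DENOM is the interpolation denominator used by Utils::lerp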

// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
// i.e. can we search shapes in a particular order so we can see the global error minima easily and
// stop without having to touch all shapes?

#include "shapes_two.h"
// use only the first 32 available shapes
#undef NSHAPES
#undef SHAPEBITS
#define NSHAPES 32
#define SHAPEBITS 5

#define	POS_TO_X(pos)	((pos)&3)
#define	POS_TO_Y(pos)	(((pos)>>2)&3)

#define	NDELTA	4

struct Chanpat
{
    int prec[NDELTA];		// precision pattern for one channel
};

struct Pattern
{
    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel
    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
    int mode;                   // associated mode value
    int modebits;               // number of mode bits
    const char *encoding;       // verilog description of encoding for this mode
};

#define MAXMODEBITS	5
#define	MAXMODES (1<<MAXMODEBITS)

#define	NPATTERNS 10
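
// each row below flattens one Pattern: three Chanpat precision quadruples (one per channel,
// in R, G, B order), then the transformed flag, the mode value, the mode bit count, and the
// bit-layout string consumed by Utils::parse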
static const Pattern patterns[NPATTERNS] =
{
    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
};

// mapping of mode to the corresponding index in pattern
// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
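// -1 marks mode values this two-region coder should never see: they either decode as the
// 2-bit modes 0x00/0x01 before this table is consulted, or belong to the one-region coder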
static const int mode_to_pat[MAXMODES] = {
    3,	// 0x00
    8,	// 0x01
    0,	// 0x02
    -1,-1,-1,
    1,	// 0x06
    -1,-1,-1,
    2,	// 0x0a
    -1,-1,-1,
    4,	// 0x0e
    -1,-1,-1,
    5,	// 0x12
    -2,-1,-1,
    6,	// 0x16
    -2,-1,-1,
    7,	// 0x1a
    -2,-1,-1,
    9,	// 0x1e
    -2
};

#define	R_0(ep)	(ep)[0].A[i]
#define	R_1(ep)	(ep)[0].B[i]
#define	R_2(ep)	(ep)[1].A[i]
#define	R_3(ep)	(ep)[1].B[i]
#define	MASK(n)	((1<<(n))-1)
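// R_0..R_3 are the four endpoint values of channel i in block order: region 0's A and B,
// then region 1's A and B -- the same values the encoding strings call w, x, y and z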

// compress endpoints
static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
{
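    // when the pattern is transformed, region 0's A endpoint (R_0) is the anchor and the
    // other three values are stored as deltas from it, each masked to that delta's precision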
    if (p.transformed)
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
            R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
            R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
        }
    }
    else
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
            R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
            R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
        }
    }
}

// decompress endpoints
static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
{
    bool issigned = Utils::FORMAT == SIGNED_F16;

    if (p.transformed)
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
            int t;
            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
        }
    }
    else
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
        }
    }
}

static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
{
    for (int region = 0; region < NREGIONS_TWO; ++region)
    {
        q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
        q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
        q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
        q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
        q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
        q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
    }
}

// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
{
    for (int region = 0; region < NREGIONS_TWO; ++region)
    {
        int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);

        int x = POS_TO_X(position);
        int y = POS_TO_Y(position);
        nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
        if (indices[y][x] & HIGH_INDEXBIT)
        {
            // high bit is set, swap the endpts and indices for this region
            int t;
            for (int i=0; i<NCHANNELS; ++i)
            {
                t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
            }

            for (int y = 0; y < Tile::TILE_H; y++)
                for (int x = 0; x < Tile::TILE_W; x++)
                    if (REGION(x,y,shapeindex) == region)
                        indices[y][x] = NINDICES - 1 - indices[y][x];
        }
    }
}

// endpoints fit only if the compression was lossless
static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
{
    IntEndpts uncompressed[NREGIONS_TWO];

    decompress_endpts(compressed, uncompressed, p);

    for (int j=0; j<NREGIONS_TWO; ++j)
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            if (orig[j].A[i] != uncompressed[j].A[i]) return false;
            if (orig[j].B[i] != uncompressed[j].B[i]) return false;
        }
    }
    return true;
}

static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
{
    // interpret the verilog backwards and process it
    int m = p.mode;
    int d = shapeindex;
    int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
    int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
    int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
    int ptr = int(strlen(p.encoding));
    while (ptr)
    {
        Field field;
        int endbit, len;

        // !!!UNDONE: get rid of string parsing!!!
        Utils::parse(p.encoding, ptr, field, endbit, len);
        switch(field)
        {
        case FIELD_M:	out.write( m >> endbit, len); break;
        case FIELD_D:	out.write( d >> endbit, len); break;
        case FIELD_RW:	out.write(rw >> endbit, len); break;
        case FIELD_RX:	out.write(rx >> endbit, len); break;
        case FIELD_RY:	out.write(ry >> endbit, len); break;
        case FIELD_RZ:	out.write(rz >> endbit, len); break;
        case FIELD_GW:	out.write(gw >> endbit, len); break;
        case FIELD_GX:	out.write(gx >> endbit, len); break;
        case FIELD_GY:	out.write(gy >> endbit, len); break;
        case FIELD_GZ:	out.write(gz >> endbit, len); break;
        case FIELD_BW:	out.write(bw >> endbit, len); break;
        case FIELD_BX:	out.write(bx >> endbit, len); break;
        case FIELD_BY:	out.write(by >> endbit, len); break;
        case FIELD_BZ:	out.write(bz >> endbit, len); break;
        default: nvUnreachable();
        }
    }
}

static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
{
    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
    int mode = in.read(2);
    if (mode != 0x00 && mode != 0x01)
        mode = (in.read(3) << 2) | mode;
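    // modes 0x00 and 0x01 use a 2-bit mode field; every other mode uses 5 bits with the low
    // two bits stored first, so the remaining three bits are appended above them here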

    int pat_index = mode_to_pat[mode];

    if (pat_index == -2)
        return false;		// reserved mode found

    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);

    p = patterns[pat_index];

    int d;
    int rw, rx, ry, rz;
    int gw, gx, gy, gz;
    int bw, bx, by, bz;

    d = 0;
    rw = rx = ry = rz = 0;
    gw = gx = gy = gz = 0;
    bw = bx = by = bz = 0;

    int ptr = int(strlen(p.encoding));

    while (ptr)
    {
        Field field;
        int endbit, len;

        // !!!UNDONE: get rid of string parsing!!!
        Utils::parse(p.encoding, ptr, field, endbit, len);

        switch(field)
        {
        case FIELD_M:	break;	// already processed so ignore
        case FIELD_D:	 d |= in.read(len) << endbit; break;
        case FIELD_RW:	rw |= in.read(len) << endbit; break;
        case FIELD_RX:	rx |= in.read(len) << endbit; break;
        case FIELD_RY:	ry |= in.read(len) << endbit; break;
        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
        case FIELD_GW:	gw |= in.read(len) << endbit; break;
        case FIELD_GX:	gx |= in.read(len) << endbit; break;
        case FIELD_GY:	gy |= in.read(len) << endbit; break;
        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
        case FIELD_BW:	bw |= in.read(len) << endbit; break;
        case FIELD_BX:	bx |= in.read(len) << endbit; break;
        case FIELD_BY:	by |= in.read(len) << endbit; break;
        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
        default: nvUnreachable();
        }
    }
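
    // the header occupies 128 - 46 = 82 bits; the remaining 46 bits hold the indices
    // (16 texels * 3 bits, minus one implicit high bit for each of the two region anchors)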
    nvDebugCheck (in.getptr() == 128 - 46);

    shapeindex = d;
    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;

    return true;
}

static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
{
    int positions[NREGIONS_TWO];

    for (int r = 0; r < NREGIONS_TWO; ++r)
        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
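
    // each region's anchor texel is written with one bit fewer: swap_indices guarantees its
    // high-order index bit is 0, so the decoder can reconstruct that bit implicitly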
    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
    {
        int x = POS_TO_X(pos);
        int y = POS_TO_Y(pos);

        bool match = false;

        for (int r = 0; r < NREGIONS_TWO; ++r)
            if (positions[r] == pos) { match = true; break; }

        out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
    }
}

static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
{
    Bits out(block, ZOH::BITSIZE);

    write_header(compr_endpts, shapeindex, p, out);

    write_indices(indices, shapeindex, out);

    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
}

static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
{
    // scale endpoints
    int a, b;			// really need an IntVector3...

    a = Utils::unquantize(endpts.A[0], prec);
    b = Utils::unquantize(endpts.B[0], prec);

    // interpolate
    for (int i = 0; i < NINDICES; ++i)
        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));

    a = Utils::unquantize(endpts.A[1], prec);
    b = Utils::unquantize(endpts.B[1], prec);

    // interpolate
    for (int i = 0; i < NINDICES; ++i)
        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));

    a = Utils::unquantize(endpts.A[2], prec);
    b = Utils::unquantize(endpts.B[2], prec);

    // interpolate
    for (int i = 0; i < NINDICES; ++i)
        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
}

static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
{
    int positions[NREGIONS_TWO];

    for (int r = 0; r < NREGIONS_TWO; ++r)
        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);

    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
    {
        int x = POS_TO_X(pos);
        int y = POS_TO_Y(pos);

        bool match = false;

        for (int r = 0; r < NREGIONS_TWO; ++r)
            if (positions[r] == pos) { match = true; break; }

        indices[y][x] = in.read(INDEXBITS - (match ? 1 : 0));
    }
}

void ZOH::decompresstwo(const char *block, Tile &t)
{
    Bits in(block, ZOH::BITSIZE);

    Pattern p;
    IntEndpts endpts[NREGIONS_TWO];
    ComprEndpts compr_endpts[NREGIONS_TWO];
    int shapeindex;

    if (!read_header(in, compr_endpts, shapeindex, p))
    {
        // reserved mode, return all zeroes
        for (int y = 0; y < Tile::TILE_H; y++)
            for (int x = 0; x < Tile::TILE_W; x++)
                t.data[y][x] = Vector3(0.0f);

        return;
    }

    decompress_endpts(compr_endpts, endpts, p);

    Vector3 palette[NREGIONS_TWO][NINDICES];
    for (int r = 0; r < NREGIONS_TWO; ++r)
        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);

    int indices[Tile::TILE_H][Tile::TILE_W];

    read_indices(in, shapeindex, indices);

    nvDebugCheck(in.getptr() == ZOH::BITSIZE);

    // lookup
    for (int y = 0; y < Tile::TILE_H; y++)
        for (int x = 0; x < Tile::TILE_W; x++)
            t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
}

// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
{
    Vector3 palette[NINDICES];
    float toterr = 0;
    Vector3 err;

    generate_palette_quantized(endpts, prec, palette);

    for (int i = 0; i < np; ++i)
    {
        float err, besterr;

        besterr = Utils::norm(colors[i], palette[0]) * importance[i];

        for (int j = 1; j < NINDICES && besterr > 0; ++j)
        {
            err = Utils::norm(colors[i], palette[j]) * importance[i];

            if (err > besterr)	// error increased, so we're done searching
                break;
            if (err < besterr)
                besterr = err;
        }
        toterr += besterr;
    }
    return toterr;
}

// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec,
                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
{
    // build list of possibles
    Vector3 palette[NREGIONS_TWO][NINDICES];

    for (int region = 0; region < NREGIONS_TWO; ++region)
    {
        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
        toterr[region] = 0;
    }

    Vector3 err;

    for (int y = 0; y < tile.size_y; y++)
        for (int x = 0; x < tile.size_x; x++)
        {
            int region = REGION(x,y,shapeindex);
            float err, besterr;

            besterr = Utils::norm(tile.data[y][x], palette[region][0]);
            indices[y][x] = 0;

            for (int i = 1; i < NINDICES && besterr > 0; ++i)
            {
                err = Utils::norm(tile.data[y][x], palette[region][i]);

                if (err > besterr)	// error increased, so we're done searching
                    break;
                if (err < besterr)
                {
                    besterr = err;
                    indices[y][x] = i;
                }
            }
            toterr[region] += besterr;
        }
}

static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
                         float old_err, int do_b)
{
    // we have the old endpoints: old_endpts
    // we have the perturbed endpoints: new_endpts
    // we have the temporary endpoints: temp_endpts

    IntEndpts temp_endpts;
    float min_err = old_err;		// start with the best current error
    int beststep;

    // copy real endpoints so we can perturb them
    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }

    // do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
    for (int step = 1 << (prec-1); step; step >>= 1)
    {
        bool improved = false;
        for (int sign = -1; sign <= 1; sign += 2)
        {
            if (do_b == 0)
            {
                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
                    continue;
            }
            else
            {
                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
                    continue;
            }

            float err = map_colors(colors, importance, np, temp_endpts, prec);

            if (err < min_err)
            {
                improved = true;
                min_err = err;
                beststep = sign * step;
            }
        }
        // if this was an improvement, move the endpoint and continue search from there
        if (improved)
        {
            if (do_b == 0)
                new_endpts.A[ch] += beststep;
            else
                new_endpts.B[ch] += beststep;
        }
    }
    return min_err;
}

static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
{
    float opt_err = orig_err;
    for (int ch = 0; ch < NCHANNELS; ++ch)
    {
        opt_endpts.A[ch] = orig_endpts.A[ch];
        opt_endpts.B[ch] = orig_endpts.B[ch];
    }
    /*
        err0 = perturb(rgb0, delta0)
        err1 = perturb(rgb1, delta1)
        if (err0 < err1)
            if (err0 >= initial_error) break
            rgb0 += delta0
            next = 1
        else
            if (err1 >= initial_error) break
            rgb1 += delta1
            next = 0
        initial_err = map()
        for (;;)
            err = perturb(next ? rgb1:rgb0, delta)
            if (err >= initial_err) break
            next? rgb1 : rgb0 += delta
            initial_err = err
    */
    IntEndpts new_a, new_b;
    IntEndpts new_endpt;
    int do_b;

    // now optimize each channel separately
    for (int ch = 0; ch < NCHANNELS; ++ch)
    {
        // figure out which endpoint when perturbed gives the most improvement and start there
        // if we just alternate, we can easily end up in a local minimum
        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B

        if (err0 < err1)
        {
            if (err0 >= opt_err)
                continue;

            opt_endpts.A[ch] = new_a.A[ch];
            opt_err = err0;
            do_b = 1;		// do B next
        }
        else
        {
            if (err1 >= opt_err)
                continue;
            opt_endpts.B[ch] = new_b.B[ch];
            opt_err = err1;
            do_b = 0;		// do A next
        }

        // now alternate endpoints and keep trying until there is no improvement
        for (;;)
        {
            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
            if (err >= opt_err)
                break;
            if (do_b == 0)
                opt_endpts.A[ch] = new_endpt.A[ch];
            else
                opt_endpts.B[ch] = new_endpt.B[ch];
            opt_err = err;
            do_b = 1 - do_b;	// now move the other endpoint
        }
    }
}

static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO],
                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
{
    Vector3 pixels[Tile::TILE_TOTAL];
    float importance[Tile::TILE_TOTAL];
    float err = 0;

    for (int region=0; region<NREGIONS_TWO; ++region)
    {
        // collect the pixels in the region
        int np = 0;

        for (int y = 0; y < tile.size_y; y++)
            for (int x = 0; x < tile.size_x; x++)
                if (REGION(x,y,shapeindex) == region)
                {
                    pixels[np] = tile.data[y][x];
                    importance[np] = tile.importance_map[y][x];
                    ++np;
                }

        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
    }
}

/* optimization algorithm
    for each pattern
        convert endpoints using pattern precision
        assign indices and get initial error
        compress indices (and possibly reorder endpoints)
        transform endpoints
        if transformed endpoints fit pattern
            get original endpoints back
            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
            compress new indices
            transform new endpoints
            if new endpoints fit pattern AND if error is improved
                emit compressed block with new data
            else
                emit compressed block with original data // to try to preserve maximum endpoint precision
*/

float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
{
    float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
    IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
    ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];

    for (int sp = 0; sp < NPATTERNS; ++sp)
    {
        // precisions for all channels need to be the same
        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);

        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
        swap_indices(orig_endpts, orig_indices, shapeindex_best);
        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
        {
            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
            swap_indices(opt_endpts, opt_indices, shapeindex_best);
            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
            orig_toterr = opt_toterr = 0;
            for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
            {
                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
                return opt_toterr;
            }
            else
            {
                // either it stopped fitting when we optimized it, or there was no improvement
                // so go back to the unoptimized endpoints which we know will fit
                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
                return orig_toterr;
            }
        }
    }
    nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)";
    return FLT_MAX;
}

static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
{
    for (int region = 0; region < NREGIONS_TWO; ++region)
        for (int i = 0; i < NINDICES; ++i)
            palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
}

// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
{
    // build list of possibles
    Vector3 palette[NREGIONS_TWO][NINDICES];

    generate_palette_unquantized(endpts, palette);

    float toterr = 0;
    Vector3 err;

    for (int y = 0; y < tile.size_y; y++)
        for (int x = 0; x < tile.size_x; x++)
        {
            int region = REGION(x,y,shapeindex);
            float err, besterr;

            besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];

            for (int i = 1; i < NINDICES && besterr > 0; ++i)
            {
                err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];

                if (err > besterr)	// error increased, so we're done searching
                    break;
                if (err < besterr)
                    besterr = err;
            }
            toterr += besterr;
        }
    return toterr;
}

float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
{
    for (int region=0; region<NREGIONS_TWO; ++region)
    {
        int np = 0;
        Vector3 colors[Tile::TILE_TOTAL];
        Vector3 mean(0,0,0);

        for (int y = 0; y < tile.size_y; y++)
            for (int x = 0; x < tile.size_x; x++)
                if (REGION(x,y,shapeindex) == region)
                {
                    colors[np] = tile.data[y][x];
                    mean += tile.data[y][x];
                    ++np;
                }

        // handle simple cases
        if (np == 0)
        {
            Vector3 zero(0,0,0);
            endpts[region].A = zero;
            endpts[region].B = zero;
            continue;
        }
        else if (np == 1)
        {
            endpts[region].A = colors[0];
            endpts[region].B = colors[0];
            continue;
        }
        else if (np == 2)
        {
            endpts[region].A = colors[0];
            endpts[region].B = colors[1];
            continue;
        }

        mean /= float(np);
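
        // estimate the principal axis of this region's colors -- the principal-components shortcut credited at the top of the file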
        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);

        // project each pixel value along the principal direction
        float minp = FLT_MAX, maxp = -FLT_MAX;
        for (int i = 0; i < np; i++)
        {
            float dp = dot(colors[i]-mean, direction);
            if (dp < minp) minp = dp;
            if (dp > maxp) maxp = dp;
        }

        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
        endpts[region].A = mean + minp*direction;
        endpts[region].B = mean + maxp*direction;

        // clamp endpoints
        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
        // shape based on endpoints being clamped
        Utils::clamp(endpts[region].A);
        Utils::clamp(endpts[region].B);
    }

    return map_colors(tile, shapeindex, endpts);
}

float ZOH::compresstwo(const Tile &t, char *block)
{
    int shapeindex_best = 0;
    FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
    float msebest = FLT_MAX;

    /*
    collect the mse values that are within 5% of the best values
    optimize each one and choose the best
    */
    // hack for now -- just use the best value WORK
    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
    {
        float mse = roughtwo(t, i, tempendpts);
        if (mse < msebest)
        {
            msebest = mse;
            shapeindex_best = i;
            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
        }
    }
    return refinetwo(t, shapeindex_best, endptsbest, block);
}
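
/* usage sketch (illustrative only -- anything not defined in this file is an assumption):
   compresstwo() packs one HDR tile into a 128-bit (16-byte) block and returns its total squared
   error; decompresstwo() reverses it. Assuming the surrounding ZOH code lets a Tile be constructed
   per 4x4 block and filled directly, the round trip would look roughly like this:

       Tile t(4, 4);                       // assumed constructor: width, height
       // ... fill t.data[y][x] with Vector3 HDR texel values
       // ... fill t.importance_map[y][x] with per-texel weights (1.0f if unweighted)
       char block[16];                     // 128 bits, i.e. ZOH::BITSIZE / 8
       float mse = ZOH::compresstwo(t, block);
       Tile decoded(4, 4);
       ZOH::decompresstwo(block, decoded); // decoded.data now holds the reconstructed texels
*/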