1 /*
2 Copyright 2007 nVidia, Inc.
3 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
4
5 You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
6
7 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
8 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
10 See the License for the specific language governing permissions and limitations under the License.
11 */
12
13 // two regions zoh compress/decompress code
14 // Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
15
16 /* optimization algorithm
17
18 get initial float endpoints
19 convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
20 note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
21 for each EC candidate in order from max precision to smaller precision
22 convert endpoints using the appropriate precision.
23 optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
24 (thus the endpoints and indices are in final form.)
25 transform and get bit delta.
26 if the bit delta fits, exit
27 if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever.
28		add a state variable to ensure we only do this once.
29 convert to bit stream.
30 return the error.
31
32 Global optimization
33 order all tiles based on their errors
34 do something special for high-error tiles
35 the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
36
37 display an image that shows partitioning and precision selected for each tile
38 */
39
40 #include "bits.h"
41 #include "tile.h"
42 #include "zoh.h"
43 #include "zoh_utils.h"
44
45 #include "nvmath/fitting.h"
46 #include "nvmath/vector.inl"
47
48 #include <string.h> // strlen
49 #include <float.h> // FLT_MAX
50
51 using namespace nv;
52 using namespace ZOH;
53
54 #define NINDICES 8
55 #define INDEXBITS 3
56 #define HIGH_INDEXBIT (1<<(INDEXBITS-1))
57 #define DENOM (NINDICES-1)
58
59 // WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
60 // i.e. can we search shapes in a particular order so we can see the global error minima easily and
61 // stop without having to touch all shapes?
62
63 #include "shapes_two.h"
64 // use only the first 32 available shapes
65 #undef NSHAPES
66 #undef SHAPEBITS
67 #define NSHAPES 32
68 #define SHAPEBITS 5
69
70 #define POS_TO_X(pos) ((pos)&3)
71 #define POS_TO_Y(pos) (((pos)>>2)&3)
72
73 #define NDELTA 4
74
// Per-channel precision pattern: bit widths of the NDELTA stored endpoint
// values for one channel (anchor plus three values/deltas).
struct Chanpat
{
    int prec[NDELTA]; // precision pattern for one channel
};
79
// One ZOH (BC6H) two-region mode: per-channel endpoint precisions plus the
// mode tag and the bit-layout description used to (de)serialize the header.
struct Pattern
{
    Chanpat chan[NCHANNELS]; // allow different bit patterns per channel -- but we still want constant precision per channel
    int transformed;         // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
    int mode;                // associated mode value
    int modebits;            // number of mode bits
    const char *encoding;    // verilog description of encoding for this mode
};
88
89 #define MAXMODEBITS 5
90 #define MAXMODES (1<<MAXMODEBITS)
91
92 #define NPATTERNS 10
93
// Each row aggregate-initializes one Pattern, in declaration order:
//   chan[0].prec[0..3], chan[1].prec[0..3], chan[2].prec[0..3],
//   transformed, mode, modebits, encoding.
// Rows are ordered from highest to lowest anchor precision (11..6 bits),
// which is the order refinetwo() tries them in.
static const Pattern patterns[NPATTERNS] =
{
    11,5,5,5, 11,4,4,4, 11,4,4,4, 1, 0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    11,4,4,4, 11,5,5,5, 11,4,4,4, 1, 0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    11,4,4,4, 11,4,4,4, 11,5,5,5, 1, 0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
    10,5,5,5, 10,5,5,5, 10,5,5,5, 1, 0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
    9,5,5,5, 9,5,5,5, 9,5,5,5, 1, 0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
    8,6,6,6, 8,5,5,5, 8,5,5,5, 1, 0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
    8,5,5,5, 8,6,6,6, 8,5,5,5, 1, 0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
    8,5,5,5, 8,5,5,5, 8,6,6,6, 1, 0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
    7,6,6,6, 7,6,6,6, 7,6,6,6, 1, 0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
    6,6,6,6, 6,6,6,6, 6,6,6,6, 0, 0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
};
107
// mapping of mode to the corresponding index in pattern
// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
// (-1 entries are mode values that can never be decoded because the low
// two bits select the 2-bit modes 0x00/0x01 first; -2 marks reserved modes)
static const int mode_to_pat[MAXMODES] = {
    3,	// 0x00
    8,	// 0x01
    0,	// 0x02
    -1,-1,-1,
    1,	// 0x06
    -1,-1,-1,
    2,	// 0x0a
    -1,-1,-1,
    4,	// 0x0e
    -1,-1,-1,
    5,	// 0x12
    -2,-1,-1,
    6,	// 0x16
    -2,-1,-1,
    7,	// 0x1a
    -2,-1,-1,
    9,	// 0x1e
    -2
};
130
131 #define R_0(ep) (ep)[0].A[i]
132 #define R_1(ep) (ep)[0].B[i]
133 #define R_2(ep) (ep)[1].A[i]
134 #define R_3(ep) (ep)[1].B[i]
135 #define MASK(n) ((1<<(n))-1)
136
137 // compress endpoints
compress_endpts(const IntEndpts in[NREGIONS_TWO],ComprEndpts out[NREGIONS_TWO],const Pattern & p)138 static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
139 {
140 if (p.transformed)
141 {
142 for (int i=0; i<NCHANNELS; ++i)
143 {
144 R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
145 R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
146 R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
147 R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
148 }
149 }
150 else
151 {
152 for (int i=0; i<NCHANNELS; ++i)
153 {
154 R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
155 R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
156 R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
157 R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
158 }
159 }
160 }
161
// decompress endpoints
// Inverse of compress_endpts: rebuilds the four per-channel endpoint values
// from their stored form, undoing the delta transform when present and
// sign-extending results when the texture format is signed half-float.
static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
{
    bool issigned = Utils::FORMAT == SIGNED_F16;

    if (p.transformed)
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            // Anchor endpoint is stored verbatim at prec[0] bits.
            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
            int t;
            // Each delta is sign-extended at its own (smaller) precision,
            // added to the raw (un-extended) anchor, then wrapped to the
            // anchor's precision -- mirroring the masking done on compress.
            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
        }
    }
    else
    {
        for (int i=0; i<NCHANNELS; ++i)
        {
            // Untransformed: each endpoint stands alone at its own precision.
            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
        }
    }
}
195
// Quantize each region's float endpoints (per channel) to 'prec' bits.
static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
{
    for (int region = 0; region < NREGIONS_TWO; ++region)
    {
        const FltEndpts &src = endpts[region];
        IntEndpts &dst = q_endpts[region];

        dst.A[0] = Utils::quantize(src.A.x, prec);
        dst.A[1] = Utils::quantize(src.A.y, prec);
        dst.A[2] = Utils::quantize(src.A.z, prec);
        dst.B[0] = Utils::quantize(src.B.x, prec);
        dst.B[1] = Utils::quantize(src.B.y, prec);
        dst.B[2] = Utils::quantize(src.B.z, prec);
    }
}
208
209 // swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
swap_indices(IntEndpts endpts[NREGIONS_TWO],int indices[Tile::TILE_H][Tile::TILE_W],int shapeindex)210 static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
211 {
212 for (int region = 0; region < NREGIONS_TWO; ++region)
213 {
214 int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
215
216 int x = POS_TO_X(position);
217 int y = POS_TO_Y(position);
218 nvDebugCheck(REGION(x,y,shapeindex) == region); // double check the table
219 if (indices[y][x] & HIGH_INDEXBIT)
220 {
221 // high bit is set, swap the endpts and indices for this region
222 int t;
223 for (int i=0; i<NCHANNELS; ++i)
224 {
225 t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
226 }
227
228 for (int y = 0; y < Tile::TILE_H; y++)
229 for (int x = 0; x < Tile::TILE_W; x++)
230 if (REGION(x,y,shapeindex) == region)
231 indices[y][x] = NINDICES - 1 - indices[y][x];
232 }
233 }
234 }
235
236 // endpoints fit only if the compression was lossless
endpts_fit(const IntEndpts orig[NREGIONS_TWO],const ComprEndpts compressed[NREGIONS_TWO],const Pattern & p)237 static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
238 {
239 IntEndpts uncompressed[NREGIONS_TWO];
240
241 decompress_endpts(compressed, uncompressed, p);
242
243 for (int j=0; j<NREGIONS_TWO; ++j)
244 {
245 for (int i=0; i<NCHANNELS; ++i)
246 {
247 if (orig[j].A[i] != uncompressed[j].A[i]) return false;
248 if (orig[j].B[i] != uncompressed[j].B[i]) return false;
249 }
250 }
251 return true;
252 }
253
// Serialize the block header (mode, shape index, compressed endpoints) by
// walking the pattern's bit-layout description.
static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
{
    // interpret the verilog backwards and process it
    // (the encoding string lists fields MSB-of-block first, so we parse it
    // from the tail and emit bits in block order)
    int m = p.mode;
    int d = shapeindex;
    // w = region 0 endpoint A, x = region 0 endpoint B,
    // y = region 1 endpoint A, z = region 1 endpoint B, per r/g/b channel.
    int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
    int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
    int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
    int ptr = int(strlen(p.encoding));
    while (ptr)
    {
        Field field;
        int endbit, len;

        // !!!UNDONE: get rid of string parsing!!!
        // parse() consumes one "name[hi:lo]" entry, moving ptr backwards.
        Utils::parse(p.encoding, ptr, field, endbit, len);
        switch(field)
        {
        case FIELD_M:	out.write( m >> endbit, len); break;
        case FIELD_D:	out.write( d >> endbit, len); break;
        case FIELD_RW:	out.write(rw >> endbit, len); break;
        case FIELD_RX:	out.write(rx >> endbit, len); break;
        case FIELD_RY:	out.write(ry >> endbit, len); break;
        case FIELD_RZ:	out.write(rz >> endbit, len); break;
        case FIELD_GW:	out.write(gw >> endbit, len); break;
        case FIELD_GX:	out.write(gx >> endbit, len); break;
        case FIELD_GY:	out.write(gy >> endbit, len); break;
        case FIELD_GZ:	out.write(gz >> endbit, len); break;
        case FIELD_BW:	out.write(bw >> endbit, len); break;
        case FIELD_BX:	out.write(bx >> endbit, len); break;
        case FIELD_BY:	out.write(by >> endbit, len); break;
        case FIELD_BZ:	out.write(bz >> endbit, len); break;
        default: nvUnreachable();
        }
    }
}
290
// Deserialize the block header. Returns false if the mode is one of the
// reserved (unused) ZOH modes, in which case nothing else is read.
static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
{
    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
    // Modes 0x00 and 0x01 use a 2-bit mode field; all others add 3 more bits.
    int mode = in.read(2);
    if (mode != 0x00 && mode != 0x01)
        mode = (in.read(3) << 2) | mode;

    int pat_index = mode_to_pat[mode];

    if (pat_index == -2)
        return false;	// reserved mode found

    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);

    p = patterns[pat_index];

    // Field accumulators. A field's bits may be scattered across several
    // encoding entries, hence they are OR-ed together piecewise below.
    int d;
    int rw, rx, ry, rz;
    int gw, gx, gy, gz;
    int bw, bx, by, bz;

    d = 0;
    rw = rx = ry = rz = 0;
    gw = gx = gy = gz = 0;
    bw = bx = by = bz = 0;

    int ptr = int(strlen(p.encoding));

    while (ptr)
    {
        Field field;
        int endbit, len;

        // !!!UNDONE: get rid of string parsing!!!
        Utils::parse(p.encoding, ptr, field, endbit, len);

        switch(field)
        {
        case FIELD_M:	break;	// already processed so ignore
        case FIELD_D:	d |= in.read(len) << endbit; break;
        case FIELD_RW:	rw |= in.read(len) << endbit; break;
        case FIELD_RX:	rx |= in.read(len) << endbit; break;
        case FIELD_RY:	ry |= in.read(len) << endbit; break;
        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
        case FIELD_GW:	gw |= in.read(len) << endbit; break;
        case FIELD_GX:	gx |= in.read(len) << endbit; break;
        case FIELD_GY:	gy |= in.read(len) << endbit; break;
        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
        case FIELD_BW:	bw |= in.read(len) << endbit; break;
        case FIELD_BX:	bx |= in.read(len) << endbit; break;
        case FIELD_BY:	by |= in.read(len) << endbit; break;
        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
        default: nvUnreachable();
        }
    }

    // Entire header consumed: a 128-bit block minus the 46 index bits.
    nvDebugCheck (in.getptr() == 128 - 46);

    shapeindex = d;
    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;

    return true;
}
357
// Write the per-pixel palette indices. Anchor positions (one per region)
// are written with one bit less, since their high bit is guaranteed 0 by
// swap_indices().
static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
{
    int anchors[NREGIONS_TWO];

    for (int r = 0; r < NREGIONS_TWO; ++r)
        anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);

    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
    {
        bool isanchor = false;
        for (int r = 0; r < NREGIONS_TWO; ++r)
        {
            if (anchors[r] == pos)
            {
                isanchor = true;
                break;
            }
        }

        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], isanchor ? INDEXBITS - 1 : INDEXBITS);
    }
}
378
// Serialize one complete 128-bit block: header first, then the indices.
static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
{
    Bits out(block, ZOH::BITSIZE);

    write_header(compr_endpts, shapeindex, p, out);
    write_indices(indices, shapeindex, out);

    // Every bit of the block must be accounted for.
    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
}
389
generate_palette_quantized(const IntEndpts & endpts,int prec,Vector3 palette[NINDICES])390 static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
391 {
392 // scale endpoints
393 int a, b; // really need a IntVector3...
394
395 a = Utils::unquantize(endpts.A[0], prec);
396 b = Utils::unquantize(endpts.B[0], prec);
397
398 // interpolate
399 for (int i = 0; i < NINDICES; ++i)
400 palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
401
402 a = Utils::unquantize(endpts.A[1], prec);
403 b = Utils::unquantize(endpts.B[1], prec);
404
405 // interpolate
406 for (int i = 0; i < NINDICES; ++i)
407 palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
408
409 a = Utils::unquantize(endpts.A[2], prec);
410 b = Utils::unquantize(endpts.B[2], prec);
411
412 // interpolate
413 for (int i = 0; i < NINDICES; ++i)
414 palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
415 }
416
read_indices(Bits & in,int shapeindex,int indices[Tile::TILE_H][Tile::TILE_W])417 static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
418 {
419 int positions[NREGIONS_TWO];
420
421 for (int r = 0; r < NREGIONS_TWO; ++r)
422 positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
423
424 for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
425 {
426 int x = POS_TO_X(pos);
427 int y = POS_TO_Y(pos);
428
429 bool match = false;
430
431 for (int r = 0; r < NREGIONS_TWO; ++r)
432 if (positions[r] == pos) { match = true; break; }
433
434 indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
435 }
436 }
437
// Decompress one two-region ZOH block into a 4x4 tile of float colors.
// Reserved modes decode to all-zero pixels, as the format requires.
void ZOH::decompresstwo(const char *block, Tile &t)
{
    Bits in(block, ZOH::BITSIZE);

    Pattern p;
    IntEndpts endpts[NREGIONS_TWO];
    ComprEndpts compr_endpts[NREGIONS_TWO];
    int shapeindex;

    if (!read_header(in, compr_endpts, shapeindex, p))
    {
        // reserved mode, return all zeroes
        for (int y = 0; y < Tile::TILE_H; y++)
            for (int x = 0; x < Tile::TILE_W; x++)
                t.data[y][x] = Vector3(0.0f);

        return;
    }

    decompress_endpts(compr_endpts, endpts, p);

    // Rebuild each region's palette exactly as the encoder modeled it.
    Vector3 palette[NREGIONS_TWO][NINDICES];
    for (int r = 0; r < NREGIONS_TWO; ++r)
        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);

    int indices[Tile::TILE_H][Tile::TILE_W];

    read_indices(in, shapeindex, indices);

    // The header + indices must consume the whole 128-bit block.
    nvDebugCheck(in.getptr() == ZOH::BITSIZE);

    // lookup: each pixel selects a palette entry from its region
    for (int y = 0; y < Tile::TILE_H; y++)
        for (int x = 0; x < Tile::TILE_W; x++)
            t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
}
474
475 // given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
map_colors(const Vector3 colors[],const float importance[],int np,const IntEndpts & endpts,int prec)476 static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
477 {
478 Vector3 palette[NINDICES];
479 float toterr = 0;
480 Vector3 err;
481
482 generate_palette_quantized(endpts, prec, palette);
483
484 for (int i = 0; i < np; ++i)
485 {
486 float err, besterr;
487
488 besterr = Utils::norm(colors[i], palette[0]) * importance[i];
489
490 for (int j = 1; j < NINDICES && besterr > 0; ++j)
491 {
492 err = Utils::norm(colors[i], palette[j]) * importance[i];
493
494 if (err > besterr) // error increased, so we're done searching
495 break;
496 if (err < besterr)
497 besterr = err;
498 }
499 toterr += besterr;
500 }
501 return toterr;
502 }
503
504 // assign indices given a tile, shape, and quantized endpoints, return toterr for each region
assign_indices(const Tile & tile,int shapeindex,IntEndpts endpts[NREGIONS_TWO],int prec,int indices[Tile::TILE_H][Tile::TILE_W],float toterr[NREGIONS_TWO])505 static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec,
506 int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
507 {
508 // build list of possibles
509 Vector3 palette[NREGIONS_TWO][NINDICES];
510
511 for (int region = 0; region < NREGIONS_TWO; ++region)
512 {
513 generate_palette_quantized(endpts[region], prec, &palette[region][0]);
514 toterr[region] = 0;
515 }
516
517 Vector3 err;
518
519 for (int y = 0; y < tile.size_y; y++)
520 for (int x = 0; x < tile.size_x; x++)
521 {
522 int region = REGION(x,y,shapeindex);
523 float err, besterr;
524
525 besterr = Utils::norm(tile.data[y][x], palette[region][0]);
526 indices[y][x] = 0;
527
528 for (int i = 1; i < NINDICES && besterr > 0; ++i)
529 {
530 err = Utils::norm(tile.data[y][x], palette[region][i]);
531
532 if (err > besterr) // error increased, so we're done searching
533 break;
534 if (err < besterr)
535 {
536 besterr = err;
537 indices[y][x] = i;
538 }
539 }
540 toterr[region] += besterr;
541 }
542 }
543
// Perturb a single channel of one endpoint (A if do_b==0, else B) with a
// logarithmically shrinking step size, keeping any change that lowers the
// mapped error. Returns the best error found; new_endpts receives the
// corresponding endpoints (equal to old_endpts if nothing improved).
static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
                         float old_err, int do_b)
{
    // we have the old endpoints: old_endpts
    // we have the perturbed endpoints: new_endpts
    // we have the temporary endpoints: temp_endpts

    IntEndpts temp_endpts;
    float min_err = old_err;		// start with the best current error
    int beststep;	// only read when 'improved' was set in the same pass

    // copy real endpoints so we can perturb them
    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }

    // do a logarithmic search for the best error for this endpoint (which)
    for (int step = 1 << (prec-1); step; step >>= 1)
    {
        bool improved = false;
        for (int sign = -1; sign <= 1; sign += 2)
        {
            if (do_b == 0)
            {
                // try A[ch] +/- step, skipping values outside [0, 2^prec)
                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
                    continue;
            }
            else
            {
                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
                    continue;
            }

            float err = map_colors(colors, importance, np, temp_endpts, prec);

            if (err < min_err)
            {
                improved = true;
                min_err = err;
                beststep = sign * step;
            }
        }
        // if this was an improvement, move the endpoint and continue search from there
        if (improved)
        {
            if (do_b == 0)
                new_endpts.A[ch] += beststep;
            else
                new_endpts.B[ch] += beststep;
        }
    }
    return min_err;
}
597
// Optimize both endpoints of one region, channel by channel, starting from
// orig_endpts/orig_err. Results are written to opt_endpts; the interface and
// the alternating-perturbation strategy are described in the pseudocode below.
static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
{
    float opt_err = orig_err;
    for (int ch = 0; ch < NCHANNELS; ++ch)
    {
        opt_endpts.A[ch] = orig_endpts.A[ch];
        opt_endpts.B[ch] = orig_endpts.B[ch];
    }
    /*
        err0 = perturb(rgb0, delta0)
        err1 = perturb(rgb1, delta1)
        if (err0 < err1)
            if (err0 >= initial_error) break
            rgb0 += delta0
            next = 1
        else
            if (err1 >= initial_error) break
            rgb1 += delta1
            next = 0
        initial_err = map()
        for (;;)
            err = perturb(next ? rgb1:rgb0, delta)
            if (err >= initial_err) break
            next? rgb1 : rgb0 += delta
            initial_err = err
    */
    IntEndpts new_a, new_b;
    IntEndpts new_endpt;
    int do_b;

    // now optimize each channel separately
    for (int ch = 0; ch < NCHANNELS; ++ch)
    {
        // figure out which endpoint when perturbed gives the most improvement and start there
        // if we just alternate, we can easily end up in a local minima
        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B

        if (err0 < err1)
        {
            // neither perturbation helped this channel -- leave it alone
            if (err0 >= opt_err)
                continue;

            opt_endpts.A[ch] = new_a.A[ch];
            opt_err = err0;
            do_b = 1;		// do B next
        }
        else
        {
            if (err1 >= opt_err)
                continue;
            opt_endpts.B[ch] = new_b.B[ch];
            opt_err = err1;
            do_b = 0;		// do A next
        }

        // now alternate endpoints and keep trying until there is no improvement
        for (;;)
        {
            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
            if (err >= opt_err)
                break;
            if (do_b == 0)
                opt_endpts.A[ch] = new_endpt.A[ch];
            else
                opt_endpts.B[ch] = new_endpt.B[ch];
            opt_err = err;
            do_b = 1 - do_b;	// now move the other endpoint
        }
    }
}
669
// Optimize the endpoints of every region: gather each region's pixels (and
// their importance weights), then run the per-region endpoint optimizer.
static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO],
                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
{
    Vector3 pixels[Tile::TILE_TOTAL];
    float importance[Tile::TILE_TOTAL];

    for (int region=0; region<NREGIONS_TWO; ++region)
    {
        // collect the pixels in the region
        int np = 0;

        for (int y = 0; y < tile.size_y; y++)
            for (int x = 0; x < tile.size_x; x++)
                if (REGION(x,y,shapeindex) == region)
                {
                    pixels[np] = tile.data[y][x];
                    importance[np] = tile.importance_map[y][x];
                    ++np;
                }

        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
    }
}
694
695 /* optimization algorithm
696 for each pattern
697 convert endpoints using pattern precision
698 assign indices and get initial error
699 compress indices (and possibly reorder endpoints)
700 transform endpoints
701 if transformed endpoints fit pattern
702 get original endpoints back
703 optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
704 compress new indices
705 transform new endpoints
706 if new endpoints fit pattern AND if error is improved
707 emit compressed block with new data
708 else
709 emit compressed block with original data // to try to preserve maximum endpoint precision
710 */
711
// Refine the chosen shape: try each endpoint-compression pattern (ordered
// from highest to lowest precision), quantize/optimize the endpoints, and
// emit the first pattern that can represent them losslessly. Returns the
// total squared error of the emitted block.
float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
{
    float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
    IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
    ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];

    for (int sp = 0; sp < NPATTERNS; ++sp)
    {
        // precisions for all channels need to be the same
        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);

        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
        swap_indices(orig_endpts, orig_indices, shapeindex_best);
        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
        {
            // the pattern can hold the quantized endpoints -- try to improve
            // them and keep the optimized version only if it still fits AND
            // actually lowers the error
            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
            swap_indices(opt_endpts, opt_indices, shapeindex_best);
            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
            orig_toterr = opt_toterr = 0;
            for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
            {
                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
                return opt_toterr;
            }
            else
            {
                // either it stopped fitting when we optimized it, or there was no improvement
                // so go back to the unoptimized endpoints which we know will fit
                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
                return orig_toterr;
            }
        }
    }
    // the lowest-precision pattern is untransformed, so it should always fit
    nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)";
    return FLT_MAX;
}
753
generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO],Vector3 palette[NREGIONS_TWO][NINDICES])754 static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
755 {
756 for (int region = 0; region < NREGIONS_TWO; ++region)
757 for (int i = 0; i < NINDICES; ++i)
758 palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
759 }
760
761 // generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
map_colors(const Tile & tile,int shapeindex,const FltEndpts endpts[NREGIONS_TWO])762 static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
763 {
764 // build list of possibles
765 Vector3 palette[NREGIONS_TWO][NINDICES];
766
767 generate_palette_unquantized(endpts, palette);
768
769 float toterr = 0;
770 Vector3 err;
771
772 for (int y = 0; y < tile.size_y; y++)
773 for (int x = 0; x < tile.size_x; x++)
774 {
775 int region = REGION(x,y,shapeindex);
776 float err, besterr;
777
778 besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
779
780 for (int i = 1; i < NINDICES && besterr > 0; ++i)
781 {
782 err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
783
784 if (err > besterr) // error increased, so we're done searching
785 break;
786 if (err < besterr)
787 besterr = err;
788 }
789 toterr += besterr;
790 }
791 return toterr;
792 }
793
// Quickly estimate endpoints for each region of the given shape by fitting a
// line through the region's colors (principal component analysis), and
// return the resulting unquantized mapping error for the whole tile.
float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
{
    for (int region=0; region<NREGIONS_TWO; ++region)
    {
        int np = 0;
        Vector3 colors[Tile::TILE_TOTAL];
        Vector3 mean(0,0,0);

        // gather this region's colors and their mean
        for (int y = 0; y < tile.size_y; y++)
            for (int x = 0; x < tile.size_x; x++)
                if (REGION(x,y,shapeindex) == region)
                {
                    colors[np] = tile.data[y][x];
                    mean += tile.data[y][x];
                    ++np;
                }

        // handle simple cases (0, 1, or 2 pixels need no line fit)
        if (np == 0)
        {
            Vector3 zero(0,0,0);
            endpts[region].A = zero;
            endpts[region].B = zero;
            continue;
        }
        else if (np == 1)
        {
            endpts[region].A = colors[0];
            endpts[region].B = colors[0];
            continue;
        }
        else if (np == 2)
        {
            endpts[region].A = colors[0];
            endpts[region].B = colors[1];
            continue;
        }

        mean /= float(np);

        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);

        // project each pixel value along the principal direction
        float minp = FLT_MAX, maxp = -FLT_MAX;
        for (int i = 0; i < np; i++)
        {
            float dp = dot(colors[i]-mean, direction);
            if (dp < minp) minp = dp;
            if (dp > maxp) maxp = dp;
        }

        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
        endpts[region].A = mean + minp*direction;
        endpts[region].B = mean + maxp*direction;

        // clamp endpoints
        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
        // shape based on endpoints being clamped
        Utils::clamp(endpts[region].A);
        Utils::clamp(endpts[region].B);
    }

    return map_colors(tile, shapeindex, endpts);
}
858
compresstwo(const Tile & t,char * block)859 float ZOH::compresstwo(const Tile &t, char *block)
860 {
861 int shapeindex_best = 0;
862 FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
863 float msebest = FLT_MAX;
864
865 /*
866 collect the mse values that are within 5% of the best values
867 optimize each one and choose the best
868 */
869 // hack for now -- just use the best value WORK
870 for (int i=0; i<NSHAPES && msebest>0.0; ++i)
871 {
872 float mse = roughtwo(t, i, tempendpts);
873 if (mse < msebest)
874 {
875 msebest = mse;
876 shapeindex_best = i;
877 memcpy(endptsbest, tempendpts, sizeof(endptsbest));
878 }
879
880 }
881 return refinetwo(t, shapeindex_best, endptsbest, block);
882 }
883
884