1 /*----------------------------------------------------------------------------*/
2 /**
3 * This confidential and proprietary software may be used only as
4 * authorised by a licensing agreement from ARM Limited
5 * (C) COPYRIGHT 2011-2012 ARM Limited
6 * ALL RIGHTS RESERVED
7 *
8 * The entire notice above must be reproduced on all authorised
9 * copies and copies may only be made to the extent permitted
10 * by a licensing agreement from ARM Limited.
11 *
12 * @brief approximate k-means cluster partitioning. Do this in 2 stages
13 *
14 * 1: basic clustering, a couple of passes just to get a few clusters
15 * 2: clustering based on line, a few passes until it seems to
16 * stabilize.
17 *
18 * After clustering is done, we use the clustering result to construct
19 * one bitmap for each partition. We then scan though the partition table,
20 * counting how well the bitmaps matched.
21 */
22 /*----------------------------------------------------------------------------*/
23
24 #include "astc_codec_internals.h"
25
26 // for k++ means, we need pseudo-random numbers, however using random numbers directly
27 // results in irreproducible encoding results. As such, we will instead
28 // just supply a handful of numbers from random.org, and apply an algorithm similar
29 // to XKCD #221. (http://xkcd.com/221/)
30 // cluster the texels using the k++ means clustering initialization algorithm.
31
kpp_initialize(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,float4 * cluster_centers)32 void kpp_initialize(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, float4 * cluster_centers)
33 {
34 int i;
35
36 int texels_per_block = xdim * ydim * zdim;
37
38 int cluster_center_samples[4];
39 // pick a random sample as first center-point.
40 cluster_center_samples[0] = 145897 /* number from random.org */ % texels_per_block;
41 int samples_selected = 1;
42
43 float distances[MAX_TEXELS_PER_BLOCK];
44
45 // compute the distance to the first point.
46 int sample = cluster_center_samples[0];
47 float4 center_color = float4(blk->work_data[4 * sample],
48 blk->work_data[4 * sample + 1],
49 blk->work_data[4 * sample + 2],
50 blk->work_data[4 * sample + 3]);
51
52 float distance_sum = 0.0f;
53 for (i = 0; i < texels_per_block; i++)
54 {
55 float4 color = float4(blk->work_data[4 * i],
56 blk->work_data[4 * i + 1],
57 blk->work_data[4 * i + 2],
58 blk->work_data[4 * i + 3]);
59 float4 diff = color - center_color;
60 float distance = dot(diff, diff);
61 distance_sum += distance;
62 distances[i] = distance;
63 }
64
65 // more numbers from random.org
66 float cluster_cutoffs[25] = {
67 0.952312f, 0.206893f, 0.835984f, 0.507813f, 0.466170f,
68 0.872331f, 0.488028f, 0.866394f, 0.363093f, 0.467905f,
69 0.812967f, 0.626220f, 0.932770f, 0.275454f, 0.832020f,
70 0.362217f, 0.318558f, 0.240113f, 0.009190f, 0.983995f,
71 0.566812f, 0.347661f, 0.731960f, 0.156391f, 0.297786f
72 };
73
74 while (1)
75 {
76 // pick a point in a weighted-random fashion.
77 float summa = 0.0f;
78 float distance_cutoff = distance_sum * cluster_cutoffs[samples_selected + 5 * partition_count];
79 for (i = 0; i < texels_per_block; i++)
80 {
81 summa += distances[i];
82 if (summa >= distance_cutoff)
83 break;
84 }
85 sample = i;
86 if (sample >= texels_per_block)
87 sample = texels_per_block - 1;
88
89
90 cluster_center_samples[samples_selected] = sample;
91 samples_selected++;
92 if (samples_selected >= partition_count)
93 break;
94
95 // update the distances with the new point.
96 center_color = float4(blk->work_data[4 * sample], blk->work_data[4 * sample + 1], blk->work_data[4 * sample + 2], blk->work_data[4 * sample + 3]);
97
98 distance_sum = 0.0f;
99 for (i = 0; i < texels_per_block; i++)
100 {
101 float4 color = float4(blk->work_data[4 * i],
102 blk->work_data[4 * i + 1],
103 blk->work_data[4 * i + 2],
104 blk->work_data[4 * i + 3]);
105 float4 diff = color - center_color;
106 float distance = dot(diff, diff);
107 distance = MIN(distance, distances[i]);
108 distance_sum += distance;
109 distances[i] = distance;
110 }
111 }
112
113 // finally, gather up the results.
114 for (i = 0; i < partition_count; i++)
115 {
116 int sample = cluster_center_samples[i];
117 float4 color = float4(blk->work_data[4 * sample],
118 blk->work_data[4 * sample + 1],
119 blk->work_data[4 * sample + 2],
120 blk->work_data[4 * sample + 3]);
121 cluster_centers[i] = color;
122 }
123 }
124
125
126 // basic K-means clustering: given a set of cluster centers,
127 // assign each texel to a partition
basic_kmeans_assign_pass(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,const float4 * cluster_centers,int * partition_of_texel)128 void basic_kmeans_assign_pass(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const float4 * cluster_centers, int *partition_of_texel)
129 {
130 int i, j;
131
132 int texels_per_block = xdim * ydim * zdim;
133
134 float distances[MAX_TEXELS_PER_BLOCK];
135 float4 center_color = cluster_centers[0];
136
137 int texels_per_partition[4];
138
139 texels_per_partition[0] = texels_per_block;
140 for (i = 1; i < partition_count; i++)
141 texels_per_partition[i] = 0;
142
143
144 for (i = 0; i < texels_per_block; i++)
145 {
146 float4 color = float4(blk->work_data[4 * i],
147 blk->work_data[4 * i + 1],
148 blk->work_data[4 * i + 2],
149 blk->work_data[4 * i + 3]);
150 float4 diff = color - center_color;
151 float distance = dot(diff, diff);
152 distances[i] = distance;
153 partition_of_texel[i] = 0;
154 }
155
156
157
158 for (j = 1; j < partition_count; j++)
159 {
160 float4 center_color = cluster_centers[j];
161
162 for (i = 0; i < texels_per_block; i++)
163 {
164 float4 color = float4(blk->work_data[4 * i],
165 blk->work_data[4 * i + 1],
166 blk->work_data[4 * i + 2],
167 blk->work_data[4 * i + 3]);
168 float4 diff = color - center_color;
169 float distance = dot(diff, diff);
170 if (distance < distances[i])
171 {
172 distances[i] = distance;
173 texels_per_partition[partition_of_texel[i]]--;
174 texels_per_partition[j]++;
175 partition_of_texel[i] = j;
176 }
177 }
178 }
179
180 // it is possible to get a situation where one of the partitions ends up
181 // without any texels. In this case, we assign texel N to partition N;
182 // this is silly, but ensures that every partition retains at least one texel.
183 // Reassigning a texel in this manner may cause another partition to go empty,
184 // so if we actually did a reassignment, we run the whole loop over again.
185 int problem_case;
186 do
187 {
188 problem_case = 0;
189 for (i = 0; i < partition_count; i++)
190 {
191 if (texels_per_partition[i] == 0)
192 {
193 texels_per_partition[partition_of_texel[i]]--;
194 texels_per_partition[i]++;
195 partition_of_texel[i] = i;
196 problem_case = 1;
197 }
198 }
199 }
200 while (problem_case != 0);
201
202 }
203
204
205 // basic k-means clustering: given a set of cluster assignments
206 // for the texels, find the center position of each cluster.
basic_kmeans_update(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,const int * partition_of_texel,float4 * cluster_centers)207 void basic_kmeans_update(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const int *partition_of_texel, float4 * cluster_centers)
208 {
209 int i;
210
211 int texels_per_block = xdim * ydim * zdim;
212
213 float4 color_sum[4];
214 int weight_sum[4];
215
216 for (i = 0; i < partition_count; i++)
217 {
218 color_sum[i] = float4(0, 0, 0, 0);
219 weight_sum[i] = 0;
220 }
221
222
223 // first, find the center-of-gravity in each cluster
224 for (i = 0; i < texels_per_block; i++)
225 {
226 float4 color = float4(blk->work_data[4 * i],
227 blk->work_data[4 * i + 1],
228 blk->work_data[4 * i + 2],
229 blk->work_data[4 * i + 3]);
230 int part = partition_of_texel[i];
231 color_sum[part] = color_sum[part] + color;
232 weight_sum[part]++;
233 }
234
235 for (i = 0; i < partition_count; i++)
236 {
237 cluster_centers[i] = color_sum[i] * (1.0f / weight_sum[i]);
238 }
239 }
240
241
242
243
244 // after a few rounds of k-means-clustering, we should have a set of 2, 3 or 4 partitions;
245 // we then turn this set into 2, 3 or 4 bitmaps. Then, for each of the 1024 partitions,
246 // we try to match the bitmaps as well as possible.
247
248
249
250
bitcount(uint64_t p)251 static inline int bitcount(uint64_t p)
252 {
253 if (sizeof(void *) > 4)
254 {
255 uint64_t mask1 = 0x5555555555555555ULL;
256 uint64_t mask2 = 0x3333333333333333ULL;
257 uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
258 // best-known algorithm for 64-bit bitcount, assuming 64-bit processor
259 // should probably be adapted for use with 32-bit processors and/or processors
260 // with a POPCNT instruction, but leave that for later.
261 p -= (p >> 1) & mask1;
262 p = (p & mask2) + ((p >> 2) & mask2);
263 p += p >> 4;
264 p &= mask3;
265 p *= 0x0101010101010101ULL;
266 p >>= 56;
267 return (int)p;
268 }
269 else
270 {
271 // on 32-bit processor, split the 64-bit input argument in two,
272 // and bitcount each half separately.
273 uint32_t p1 = (uint32_t) p;
274 uint32_t p2 = (uint32_t) (p >> 32);
275 uint32_t mask1 = 0x55555555U;
276 uint32_t mask2 = 0x33333333U;
277 uint32_t mask3 = 0x0F0F0F0FU;
278 p1 = p1 - ((p1 >> 1) & mask1);
279 p2 = p2 - ((p2 >> 1) & mask1);
280 p1 = (p1 & mask2) + ((p1 >> 2) & mask2);
281 p2 = (p2 & mask2) + ((p2 >> 2) & mask2);
282 p1 += p1 >> 4;
283 p2 += p2 >> 4;
284 p1 &= mask3;
285 p2 &= mask3;
286 p1 += p2;
287 p1 *= 0x01010101U;
288 p1 >>= 24;
289 return (int)p1;
290 }
291 }
292
293
294 // compute the bit-mismatch for a partitioning in 2-partition mode
partition_mismatch2(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)295 static inline int partition_mismatch2(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
296 {
297 int v1 = bitcount(a0 ^ b0) + bitcount(a1 ^ b1);
298 int v2 = bitcount(a0 ^ b1) + bitcount(a1 ^ b0);
299 return MIN(v1, v2);
300 }
301
302
303 // compute the bit-mismatch for a partitioning in 3-partition mode
partition_mismatch3(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2)304 static inline int partition_mismatch3(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t b0, uint64_t b1, uint64_t b2)
305 {
306 int p00 = bitcount(a0 ^ b0);
307 int p01 = bitcount(a0 ^ b1);
308 int p02 = bitcount(a0 ^ b2);
309
310 int p10 = bitcount(a1 ^ b0);
311 int p11 = bitcount(a1 ^ b1);
312 int p12 = bitcount(a1 ^ b2);
313
314 int p20 = bitcount(a2 ^ b0);
315 int p21 = bitcount(a2 ^ b1);
316 int p22 = bitcount(a2 ^ b2);
317
318 int s0 = p11 + p22;
319 int s1 = p12 + p21;
320 int v0 = MIN(s0, s1) + p00;
321
322 int s2 = p10 + p22;
323 int s3 = p12 + p20;
324 int v1 = MIN(s2, s3) + p01;
325
326 int s4 = p10 + p21;
327 int s5 = p11 + p20;
328 int v2 = MIN(s4, s5) + p02;
329
330 if (v1 < v0)
331 v0 = v1;
332 if (v2 < v0)
333 v0 = v2;
334
335 // 9 add, 5 MIN
336
337 return v0;
338 }
339
MIN3(int a,int b,int c)340 static inline int MIN3(int a, int b, int c)
341 {
342 int d = MIN(a, b);
343 return MIN(c, d);
344 }
345
346 // compute the bit-mismatch for a partitioning in 4-partition mode
partition_mismatch4(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t a3,uint64_t b0,uint64_t b1,uint64_t b2,uint64_t b3)347 static inline int partition_mismatch4(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3, uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3)
348 {
349 int p00 = bitcount(a0 ^ b0);
350 int p01 = bitcount(a0 ^ b1);
351 int p02 = bitcount(a0 ^ b2);
352 int p03 = bitcount(a0 ^ b3);
353
354 int p10 = bitcount(a1 ^ b0);
355 int p11 = bitcount(a1 ^ b1);
356 int p12 = bitcount(a1 ^ b2);
357 int p13 = bitcount(a1 ^ b3);
358
359 int p20 = bitcount(a2 ^ b0);
360 int p21 = bitcount(a2 ^ b1);
361 int p22 = bitcount(a2 ^ b2);
362 int p23 = bitcount(a2 ^ b3);
363
364 int p30 = bitcount(a3 ^ b0);
365 int p31 = bitcount(a3 ^ b1);
366 int p32 = bitcount(a3 ^ b2);
367 int p33 = bitcount(a3 ^ b3);
368
369 int mx23 = MIN(p22 + p33, p23 + p32);
370 int mx13 = MIN(p21 + p33, p23 + p31);
371 int mx12 = MIN(p21 + p32, p22 + p31);
372 int mx03 = MIN(p20 + p33, p23 + p30);
373 int mx02 = MIN(p20 + p32, p22 + p30);
374 int mx01 = MIN(p21 + p30, p20 + p31);
375
376 int v0 = p00 + MIN3(p11 + mx23, p12 + mx13, p13 + mx12);
377 int v1 = p01 + MIN3(p10 + mx23, p12 + mx03, p13 + mx02);
378 int v2 = p02 + MIN3(p11 + mx03, p10 + mx13, p13 + mx01);
379 int v3 = p03 + MIN3(p11 + mx02, p12 + mx01, p10 + mx12);
380
381 int x0 = MIN(v0, v1);
382 int x1 = MIN(v2, v3);
383 return MIN(x0, x1);
384
385 // 16 bitcount, 17 MIN, 28 ADD
386 }
387
388
389
count_partition_mismatch_bits(int xdim,int ydim,int zdim,int partition_count,const uint64_t bitmaps[4],int bitcounts[PARTITION_COUNT])390 void count_partition_mismatch_bits(int xdim, int ydim, int zdim, int partition_count, const uint64_t bitmaps[4], int bitcounts[PARTITION_COUNT])
391 {
392 int i;
393 const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count);
394
395 if (partition_count == 2)
396 {
397 uint64_t bm0 = bitmaps[0];
398 uint64_t bm1 = bitmaps[1];
399 for (i = 0; i < PARTITION_COUNT; i++)
400 {
401 if (pi->partition_count == 2)
402 {
403 bitcounts[i] = partition_mismatch2(bm0, bm1, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1]);
404 }
405 else
406 bitcounts[i] = 255;
407 pi++;
408 }
409 }
410 else if (partition_count == 3)
411 {
412 uint64_t bm0 = bitmaps[0];
413 uint64_t bm1 = bitmaps[1];
414 uint64_t bm2 = bitmaps[2];
415 for (i = 0; i < PARTITION_COUNT; i++)
416 {
417 if (pi->partition_count == 3)
418 {
419 bitcounts[i] = partition_mismatch3(bm0, bm1, bm2, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2]);
420 }
421 else
422 bitcounts[i] = 255;
423 pi++;
424 }
425 }
426 else if (partition_count == 4)
427 {
428 uint64_t bm0 = bitmaps[0];
429 uint64_t bm1 = bitmaps[1];
430 uint64_t bm2 = bitmaps[2];
431 uint64_t bm3 = bitmaps[3];
432 for (i = 0; i < PARTITION_COUNT; i++)
433 {
434 if (pi->partition_count == 4)
435 {
436 bitcounts[i] = partition_mismatch4(bm0, bm1, bm2, bm3, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2], pi->coverage_bitmaps[3]);
437 }
438 else
439 bitcounts[i] = 255;
440 pi++;
441 }
442 }
443
444 }
445
446
447 // counting-sort on the mismatch-bits, thereby
448 // sorting the partitions into an ordering.
449
get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT],int partition_ordering[PARTITION_COUNT])450 void get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT], int partition_ordering[PARTITION_COUNT])
451 {
452 int i;
453
454 int mscount[256];
455 for (i = 0; i < 256; i++)
456 mscount[i] = 0;
457
458 for (i = 0; i < PARTITION_COUNT; i++)
459 mscount[mismatch_bits[i]]++;
460
461 int summa = 0;
462 for (i = 0; i < 256; i++)
463 {
464 int cnt = mscount[i];
465 mscount[i] = summa;
466 summa += cnt;
467 }
468
469 for (i = 0; i < PARTITION_COUNT; i++)
470 {
471 int idx = mscount[mismatch_bits[i]]++;
472 partition_ordering[idx] = i;
473 }
474 }
475
476
477
478
kmeans_compute_partition_ordering(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,int * ordering)479 void kmeans_compute_partition_ordering(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, int *ordering)
480 {
481 int i;
482
483 const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
484
485 float4 cluster_centers[4];
486 int partition_of_texel[MAX_TEXELS_PER_BLOCK];
487
488 // 3 passes of plain k-means partitioning
489 for (i = 0; i < 3; i++)
490 {
491 if (i == 0)
492 kpp_initialize(xdim, ydim, zdim, partition_count, blk, cluster_centers);
493 else
494 basic_kmeans_update(xdim, ydim, zdim, partition_count, blk, partition_of_texel, cluster_centers);
495
496 basic_kmeans_assign_pass(xdim, ydim, zdim, partition_count, blk, cluster_centers, partition_of_texel);
497 }
498
499 // at this point, we have a near-ideal partitioning.
500
501 // construct bitmaps
502 uint64_t bitmaps[4];
503 for (i = 0; i < 4; i++)
504 bitmaps[i] = 0ULL;
505
506 int texels_to_process = bsd->texelcount_for_bitmap_partitioning;
507 for (i = 0; i < texels_to_process; i++)
508 {
509 int idx = bsd->texels_for_bitmap_partitioning[i];
510 bitmaps[partition_of_texel[idx]] |= 1ULL << i;
511 }
512
513 int bitcounts[PARTITION_COUNT];
514 // for each entry in the partition table, count bits of partition-mismatch.
515 count_partition_mismatch_bits(xdim, ydim, zdim, partition_count, bitmaps, bitcounts);
516
517 // finally, sort the partitions by bits-of-partition-mismatch
518 get_partition_ordering_by_mismatch_bits(bitcounts, ordering);
519
520 }
521