1 /*----------------------------------------------------------------------------*/
2 /**
3  *	This confidential and proprietary software may be used only as
4  *	authorised by a licensing agreement from ARM Limited
5  *	(C) COPYRIGHT 2011-2012 ARM Limited
6  *	ALL RIGHTS RESERVED
7  *
8  *	The entire notice above must be reproduced on all authorised
9  *	copies and copies may only be made to the extent permitted
10  *	by a licensing agreement from ARM Limited.
11  *
12  *	@brief	approximate k-means cluster partitioning. Do this in 2 stages
13  *
14  *			1: basic clustering, a couple of passes just to get a few clusters
15  *			2: clustering based on line, a few passes until it seems to
16  *			   stabilize.
17  *
18  *			After clustering is done, we use the clustering result to construct
19  *			one bitmap for each partition. We then scan though the partition table,
20  *			counting how well the bitmaps matched.
21  */
22 /*----------------------------------------------------------------------------*/
23 
24 #include "astc_codec_internals.h"
25 
26 // for k++ means, we need pseudo-random numbers, however using random numbers directly
27 // results in irreproducible encoding results. As such, we will instead
28 // just supply a handful of numbers from random.org, and apply an algorithm similar
29 // to XKCD #221. (http://xkcd.com/221/)
30 // cluster the texels using the k++ means clustering initialization algorithm.
31 
kpp_initialize(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,float4 * cluster_centers)32 void kpp_initialize(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, float4 * cluster_centers)
33 {
34 	int i;
35 
36 	int texels_per_block = xdim * ydim * zdim;
37 
38 	int cluster_center_samples[4];
39 	// pick a random sample as first center-point.
40 	cluster_center_samples[0] = 145897 /* number from random.org */  % texels_per_block;
41 	int samples_selected = 1;
42 
43 	float distances[MAX_TEXELS_PER_BLOCK];
44 
45 	// compute the distance to the first point.
46 	int sample = cluster_center_samples[0];
47 	float4 center_color = float4(blk->work_data[4 * sample],
48 								 blk->work_data[4 * sample + 1],
49 								 blk->work_data[4 * sample + 2],
50 								 blk->work_data[4 * sample + 3]);
51 
52 	float distance_sum = 0.0f;
53 	for (i = 0; i < texels_per_block; i++)
54 	{
55 		float4 color = float4(blk->work_data[4 * i],
56 							  blk->work_data[4 * i + 1],
57 							  blk->work_data[4 * i + 2],
58 							  blk->work_data[4 * i + 3]);
59 		float4 diff = color - center_color;
60 		float distance = dot(diff, diff);
61 		distance_sum += distance;
62 		distances[i] = distance;
63 	}
64 
65 	// more numbers from random.org
66 	float cluster_cutoffs[25] = {
67 		0.952312f, 0.206893f, 0.835984f, 0.507813f, 0.466170f,
68 		0.872331f, 0.488028f, 0.866394f, 0.363093f, 0.467905f,
69 		0.812967f, 0.626220f, 0.932770f, 0.275454f, 0.832020f,
70 		0.362217f, 0.318558f, 0.240113f, 0.009190f, 0.983995f,
71 		0.566812f, 0.347661f, 0.731960f, 0.156391f, 0.297786f
72 	};
73 
74 	while (1)
75 	{
76 		// pick a point in a weighted-random fashion.
77 		float summa = 0.0f;
78 		float distance_cutoff = distance_sum * cluster_cutoffs[samples_selected + 5 * partition_count];
79 		for (i = 0; i < texels_per_block; i++)
80 		{
81 			summa += distances[i];
82 			if (summa >= distance_cutoff)
83 				break;
84 		}
85 		sample = i;
86 		if (sample >= texels_per_block)
87 			sample = texels_per_block - 1;
88 
89 
90 		cluster_center_samples[samples_selected] = sample;
91 		samples_selected++;
92 		if (samples_selected >= partition_count)
93 			break;
94 
95 		// update the distances with the new point.
96 		center_color = float4(blk->work_data[4 * sample], blk->work_data[4 * sample + 1], blk->work_data[4 * sample + 2], blk->work_data[4 * sample + 3]);
97 
98 		distance_sum = 0.0f;
99 		for (i = 0; i < texels_per_block; i++)
100 		{
101 			float4 color = float4(blk->work_data[4 * i],
102 								  blk->work_data[4 * i + 1],
103 								  blk->work_data[4 * i + 2],
104 								  blk->work_data[4 * i + 3]);
105 			float4 diff = color - center_color;
106 			float distance = dot(diff, diff);
107 			distance = MIN(distance, distances[i]);
108 			distance_sum += distance;
109 			distances[i] = distance;
110 		}
111 	}
112 
113 	// finally, gather up the results.
114 	for (i = 0; i < partition_count; i++)
115 	{
116 		int sample = cluster_center_samples[i];
117 		float4 color = float4(blk->work_data[4 * sample],
118 							  blk->work_data[4 * sample + 1],
119 							  blk->work_data[4 * sample + 2],
120 							  blk->work_data[4 * sample + 3]);
121 		cluster_centers[i] = color;
122 	}
123 }
124 
125 
126 // basic K-means clustering: given a set of cluster centers,
127 // assign each texel to a partition
basic_kmeans_assign_pass(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,const float4 * cluster_centers,int * partition_of_texel)128 void basic_kmeans_assign_pass(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const float4 * cluster_centers, int *partition_of_texel)
129 {
130 	int i, j;
131 
132 	int texels_per_block = xdim * ydim * zdim;
133 
134 	float distances[MAX_TEXELS_PER_BLOCK];
135 	float4 center_color = cluster_centers[0];
136 
137 	int texels_per_partition[4];
138 
139 	texels_per_partition[0] = texels_per_block;
140 	for (i = 1; i < partition_count; i++)
141 		texels_per_partition[i] = 0;
142 
143 
144 	for (i = 0; i < texels_per_block; i++)
145 	{
146 		float4 color = float4(blk->work_data[4 * i],
147 							  blk->work_data[4 * i + 1],
148 							  blk->work_data[4 * i + 2],
149 							  blk->work_data[4 * i + 3]);
150 		float4 diff = color - center_color;
151 		float distance = dot(diff, diff);
152 		distances[i] = distance;
153 		partition_of_texel[i] = 0;
154 	}
155 
156 
157 
158 	for (j = 1; j < partition_count; j++)
159 	{
160 		float4 center_color = cluster_centers[j];
161 
162 		for (i = 0; i < texels_per_block; i++)
163 		{
164 			float4 color = float4(blk->work_data[4 * i],
165 								  blk->work_data[4 * i + 1],
166 								  blk->work_data[4 * i + 2],
167 								  blk->work_data[4 * i + 3]);
168 			float4 diff = color - center_color;
169 			float distance = dot(diff, diff);
170 			if (distance < distances[i])
171 			{
172 				distances[i] = distance;
173 				texels_per_partition[partition_of_texel[i]]--;
174 				texels_per_partition[j]++;
175 				partition_of_texel[i] = j;
176 			}
177 		}
178 	}
179 
180 	// it is possible to get a situation where one of the partitions ends up
181 	// without any texels. In this case, we assign texel N to partition N;
182 	// this is silly, but ensures that every partition retains at least one texel.
183 	// Reassigning a texel in this manner may cause another partition to go empty,
184 	// so if we actually did a reassignment, we run the whole loop over again.
185 	int problem_case;
186 	do
187 	{
188 		problem_case = 0;
189 		for (i = 0; i < partition_count; i++)
190 		{
191 			if (texels_per_partition[i] == 0)
192 			{
193 				texels_per_partition[partition_of_texel[i]]--;
194 				texels_per_partition[i]++;
195 				partition_of_texel[i] = i;
196 				problem_case = 1;
197 			}
198 		}
199 	}
200 	while (problem_case != 0);
201 
202 }
203 
204 
205 // basic k-means clustering: given a set of cluster assignments
206 // for the texels, find the center position of each cluster.
basic_kmeans_update(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,const int * partition_of_texel,float4 * cluster_centers)207 void basic_kmeans_update(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const int *partition_of_texel, float4 * cluster_centers)
208 {
209 	int i;
210 
211 	int texels_per_block = xdim * ydim * zdim;
212 
213 	float4 color_sum[4];
214 	int weight_sum[4];
215 
216 	for (i = 0; i < partition_count; i++)
217 	{
218 		color_sum[i] = float4(0, 0, 0, 0);
219 		weight_sum[i] = 0;
220 	}
221 
222 
223 	// first, find the center-of-gravity in each cluster
224 	for (i = 0; i < texels_per_block; i++)
225 	{
226 		float4 color = float4(blk->work_data[4 * i],
227 							  blk->work_data[4 * i + 1],
228 							  blk->work_data[4 * i + 2],
229 							  blk->work_data[4 * i + 3]);
230 		int part = partition_of_texel[i];
231 		color_sum[part] = color_sum[part] + color;
232 		weight_sum[part]++;
233 	}
234 
235 	for (i = 0; i < partition_count; i++)
236 	{
237 		cluster_centers[i] = color_sum[i] * (1.0f / weight_sum[i]);
238 	}
239 }
240 
241 
242 
243 
244 // after a few rounds of k-means-clustering, we should have a set of 2, 3 or 4 partitions;
245 // we then turn this set into 2, 3 or 4 bitmaps. Then, for each of the 1024 partitions,
246 // we try to match the bitmaps as well as possible.
247 
248 
249 
250 
bitcount(uint64_t p)251 static inline int bitcount(uint64_t p)
252 {
253 	if (sizeof(void *) > 4)
254 	{
255 		uint64_t mask1 = 0x5555555555555555ULL;
256 		uint64_t mask2 = 0x3333333333333333ULL;
257 		uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
258 		// best-known algorithm for 64-bit bitcount, assuming 64-bit processor
259 		// should probably be adapted for use with 32-bit processors and/or processors
260 		// with a POPCNT instruction, but leave that for later.
261 		p -= (p >> 1) & mask1;
262 		p = (p & mask2) + ((p >> 2) & mask2);
263 		p += p >> 4;
264 		p &= mask3;
265 		p *= 0x0101010101010101ULL;
266 		p >>= 56;
267 		return (int)p;
268 	}
269 	else
270 	{
271 		// on 32-bit processor, split the 64-bit input argument in two,
272 		// and bitcount each half separately.
273 		uint32_t p1 = (uint32_t) p;
274 		uint32_t p2 = (uint32_t) (p >> 32);
275 		uint32_t mask1 = 0x55555555U;
276 		uint32_t mask2 = 0x33333333U;
277 		uint32_t mask3 = 0x0F0F0F0FU;
278 		p1 = p1 - ((p1 >> 1) & mask1);
279 		p2 = p2 - ((p2 >> 1) & mask1);
280 		p1 = (p1 & mask2) + ((p1 >> 2) & mask2);
281 		p2 = (p2 & mask2) + ((p2 >> 2) & mask2);
282 		p1 += p1 >> 4;
283 		p2 += p2 >> 4;
284 		p1 &= mask3;
285 		p2 &= mask3;
286 		p1 += p2;
287 		p1 *= 0x01010101U;
288 		p1 >>= 24;
289 		return (int)p1;
290 	}
291 }
292 
293 
294 // compute the bit-mismatch for a partitioning in 2-partition mode
partition_mismatch2(uint64_t a0,uint64_t a1,uint64_t b0,uint64_t b1)295 static inline int partition_mismatch2(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
296 {
297 	int v1 = bitcount(a0 ^ b0) + bitcount(a1 ^ b1);
298 	int v2 = bitcount(a0 ^ b1) + bitcount(a1 ^ b0);
299 	return MIN(v1, v2);
300 }
301 
302 
303 // compute the bit-mismatch for a partitioning in 3-partition mode
partition_mismatch3(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t b0,uint64_t b1,uint64_t b2)304 static inline int partition_mismatch3(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t b0, uint64_t b1, uint64_t b2)
305 {
306 	int p00 = bitcount(a0 ^ b0);
307 	int p01 = bitcount(a0 ^ b1);
308 	int p02 = bitcount(a0 ^ b2);
309 
310 	int p10 = bitcount(a1 ^ b0);
311 	int p11 = bitcount(a1 ^ b1);
312 	int p12 = bitcount(a1 ^ b2);
313 
314 	int p20 = bitcount(a2 ^ b0);
315 	int p21 = bitcount(a2 ^ b1);
316 	int p22 = bitcount(a2 ^ b2);
317 
318 	int s0 = p11 + p22;
319 	int s1 = p12 + p21;
320 	int v0 = MIN(s0, s1) + p00;
321 
322 	int s2 = p10 + p22;
323 	int s3 = p12 + p20;
324 	int v1 = MIN(s2, s3) + p01;
325 
326 	int s4 = p10 + p21;
327 	int s5 = p11 + p20;
328 	int v2 = MIN(s4, s5) + p02;
329 
330 	if (v1 < v0)
331 		v0 = v1;
332 	if (v2 < v0)
333 		v0 = v2;
334 
335 	// 9 add, 5 MIN
336 
337 	return v0;
338 }
339 
MIN3(int a,int b,int c)340 static inline int MIN3(int a, int b, int c)
341 {
342 	int d = MIN(a, b);
343 	return MIN(c, d);
344 }
345 
346 // compute the bit-mismatch for a partitioning in 4-partition mode
partition_mismatch4(uint64_t a0,uint64_t a1,uint64_t a2,uint64_t a3,uint64_t b0,uint64_t b1,uint64_t b2,uint64_t b3)347 static inline int partition_mismatch4(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3, uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3)
348 {
349 	int p00 = bitcount(a0 ^ b0);
350 	int p01 = bitcount(a0 ^ b1);
351 	int p02 = bitcount(a0 ^ b2);
352 	int p03 = bitcount(a0 ^ b3);
353 
354 	int p10 = bitcount(a1 ^ b0);
355 	int p11 = bitcount(a1 ^ b1);
356 	int p12 = bitcount(a1 ^ b2);
357 	int p13 = bitcount(a1 ^ b3);
358 
359 	int p20 = bitcount(a2 ^ b0);
360 	int p21 = bitcount(a2 ^ b1);
361 	int p22 = bitcount(a2 ^ b2);
362 	int p23 = bitcount(a2 ^ b3);
363 
364 	int p30 = bitcount(a3 ^ b0);
365 	int p31 = bitcount(a3 ^ b1);
366 	int p32 = bitcount(a3 ^ b2);
367 	int p33 = bitcount(a3 ^ b3);
368 
369 	int mx23 = MIN(p22 + p33, p23 + p32);
370 	int mx13 = MIN(p21 + p33, p23 + p31);
371 	int mx12 = MIN(p21 + p32, p22 + p31);
372 	int mx03 = MIN(p20 + p33, p23 + p30);
373 	int mx02 = MIN(p20 + p32, p22 + p30);
374 	int mx01 = MIN(p21 + p30, p20 + p31);
375 
376 	int v0 = p00 + MIN3(p11 + mx23, p12 + mx13, p13 + mx12);
377 	int v1 = p01 + MIN3(p10 + mx23, p12 + mx03, p13 + mx02);
378 	int v2 = p02 + MIN3(p11 + mx03, p10 + mx13, p13 + mx01);
379 	int v3 = p03 + MIN3(p11 + mx02, p12 + mx01, p10 + mx12);
380 
381 	int x0 = MIN(v0, v1);
382 	int x1 = MIN(v2, v3);
383 	return MIN(x0, x1);
384 
385 	// 16 bitcount, 17 MIN, 28 ADD
386 }
387 
388 
389 
count_partition_mismatch_bits(int xdim,int ydim,int zdim,int partition_count,const uint64_t bitmaps[4],int bitcounts[PARTITION_COUNT])390 void count_partition_mismatch_bits(int xdim, int ydim, int zdim, int partition_count, const uint64_t bitmaps[4], int bitcounts[PARTITION_COUNT])
391 {
392 	int i;
393 	const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count);
394 
395 	if (partition_count == 2)
396 	{
397 		uint64_t bm0 = bitmaps[0];
398 		uint64_t bm1 = bitmaps[1];
399 		for (i = 0; i < PARTITION_COUNT; i++)
400 		{
401 			if (pi->partition_count == 2)
402 			{
403 				bitcounts[i] = partition_mismatch2(bm0, bm1, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1]);
404 			}
405 			else
406 				bitcounts[i] = 255;
407 			pi++;
408 		}
409 	}
410 	else if (partition_count == 3)
411 	{
412 		uint64_t bm0 = bitmaps[0];
413 		uint64_t bm1 = bitmaps[1];
414 		uint64_t bm2 = bitmaps[2];
415 		for (i = 0; i < PARTITION_COUNT; i++)
416 		{
417 			if (pi->partition_count == 3)
418 			{
419 				bitcounts[i] = partition_mismatch3(bm0, bm1, bm2, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2]);
420 			}
421 			else
422 				bitcounts[i] = 255;
423 			pi++;
424 		}
425 	}
426 	else if (partition_count == 4)
427 	{
428 		uint64_t bm0 = bitmaps[0];
429 		uint64_t bm1 = bitmaps[1];
430 		uint64_t bm2 = bitmaps[2];
431 		uint64_t bm3 = bitmaps[3];
432 		for (i = 0; i < PARTITION_COUNT; i++)
433 		{
434 			if (pi->partition_count == 4)
435 			{
436 				bitcounts[i] = partition_mismatch4(bm0, bm1, bm2, bm3, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2], pi->coverage_bitmaps[3]);
437 			}
438 			else
439 				bitcounts[i] = 255;
440 			pi++;
441 		}
442 	}
443 
444 }
445 
446 
447 // counting-sort on the mismatch-bits, thereby
448 // sorting the partitions into an ordering.
449 
get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT],int partition_ordering[PARTITION_COUNT])450 void get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT], int partition_ordering[PARTITION_COUNT])
451 {
452 	int i;
453 
454 	int mscount[256];
455 	for (i = 0; i < 256; i++)
456 		mscount[i] = 0;
457 
458 	for (i = 0; i < PARTITION_COUNT; i++)
459 		mscount[mismatch_bits[i]]++;
460 
461 	int summa = 0;
462 	for (i = 0; i < 256; i++)
463 	{
464 		int cnt = mscount[i];
465 		mscount[i] = summa;
466 		summa += cnt;
467 	}
468 
469 	for (i = 0; i < PARTITION_COUNT; i++)
470 	{
471 		int idx = mscount[mismatch_bits[i]]++;
472 		partition_ordering[idx] = i;
473 	}
474 }
475 
476 
477 
478 
kmeans_compute_partition_ordering(int xdim,int ydim,int zdim,int partition_count,const imageblock * blk,int * ordering)479 void kmeans_compute_partition_ordering(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, int *ordering)
480 {
481 	int i;
482 
483 	const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim);
484 
485 	float4 cluster_centers[4];
486 	int partition_of_texel[MAX_TEXELS_PER_BLOCK];
487 
488 	// 3 passes of plain k-means partitioning
489 	for (i = 0; i < 3; i++)
490 	{
491 		if (i == 0)
492 			kpp_initialize(xdim, ydim, zdim, partition_count, blk, cluster_centers);
493 		else
494 			basic_kmeans_update(xdim, ydim, zdim, partition_count, blk, partition_of_texel, cluster_centers);
495 
496 		basic_kmeans_assign_pass(xdim, ydim, zdim, partition_count, blk, cluster_centers, partition_of_texel);
497 	}
498 
499 	// at this point, we have a near-ideal partitioning.
500 
501 	// construct bitmaps
502 	uint64_t bitmaps[4];
503 	for (i = 0; i < 4; i++)
504 		bitmaps[i] = 0ULL;
505 
506 	int texels_to_process = bsd->texelcount_for_bitmap_partitioning;
507 	for (i = 0; i < texels_to_process; i++)
508 	{
509 		int idx = bsd->texels_for_bitmap_partitioning[i];
510 		bitmaps[partition_of_texel[idx]] |= 1ULL << i;
511 	}
512 
513 	int bitcounts[PARTITION_COUNT];
514 	// for each entry in the partition table, count bits of partition-mismatch.
515 	count_partition_mismatch_bits(xdim, ydim, zdim, partition_count, bitmaps, bitcounts);
516 
517 	// finally, sort the partitions by bits-of-partition-mismatch
518 	get_partition_ordering_by_mismatch_bits(bitcounts, ordering);
519 
520 }
521