1 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2 /**
3  *	Contains source code from the article "Radix Sort Revisited".
4  *	\file		Radix.cpp
5  *	\author		Pierre Terdiman
6  *	\date		April, 4, 2000
7  */
8 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
9 
10 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11 /**
12  *	Revisited Radix Sort.
13  *	This is my new radix routine:
14  *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15  *  - it creates all the histograms in one run instead of four
16  *  - it sorts words faster than dwords and bytes faster than words
17  *  - it correctly sorts negative floating-point values by patching the offsets
18  *  - it automatically takes advantage of temporal coherence
19  *  - multiple keys support is a side effect of temporal coherence
20  *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
21  *
22  *	History:
23  *	- 08.15.98: very first version
24  *	- 04.04.00: recoded for the radix article
25  *	- 12.xx.00: code lifting
26  *	- 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27  *	- 10.11.01: added local ram support
28  *	- 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29  *
30  *	\class		RadixSort
31  *	\author		Pierre Terdiman
32  *	\version	1.3
33  *	\date		August, 15, 1998
34  */
35 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
36 
37 /*
38 To do:
39 	- add an offset parameter between two input values (avoid some data recopy sometimes)
40 	- unroll ? asm ?
41 */
42 
43 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
44 // Header
45 
46 #include <nvcore/Radix.h>
47 
48 #include <string.h> // memset
49 
50 //using namespace IceCore;
51 
52 #define DELETEARRAY(a)	{ delete [] a; a = NULL; }
53 #define CHECKALLOC(a)
54 
55 
56 
57 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
58 /**
59  *	Constructor.
60  */
61 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
RadixSort()62 RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
63 {
64 #ifndef RADIX_LOCAL_RAM
65 	// Allocate input-independent ram
66 	mHistogram		= new uint32[256*4];
67 	mOffset			= new uint32[256];
68 #endif
69 	// Initialize indices
70 	resetIndices();
71 }
72 
73 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
74 /**
75  *	Destructor.
76  */
77 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
~RadixSort()78 RadixSort::~RadixSort()
79 {
80 	// Release everything
81 #ifndef RADIX_LOCAL_RAM
82 	DELETEARRAY(mOffset);
83 	DELETEARRAY(mHistogram);
84 #endif
85 	DELETEARRAY(mIndices2);
86 	DELETEARRAY(mIndices);
87 }
88 
89 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
90 /**
91  *	Resizes the inner lists.
92  *	\param		nb				[in] new size (number of dwords)
93  *	\return		true if success
94  */
95 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
resize(uint32 nb)96 bool RadixSort::resize(uint32 nb)
97 {
98 	// Free previously used ram
99 	DELETEARRAY(mIndices2);
100 	DELETEARRAY(mIndices);
101 
102 	// Get some fresh one
103 	mIndices		= new uint32[nb];	CHECKALLOC(mIndices);
104 	mIndices2		= new uint32[nb];	CHECKALLOC(mIndices2);
105 	mCurrentSize	= nb;
106 
107 	// Initialize indices so that the input buffer is read in sequential order
108 	resetIndices();
109 
110 	return true;
111 }
112 
113 #define CHECK_RESIZE(n)																			\
114 	if(n!=mPreviousSize)																		\
115 	{																							\
116 				if(n>mCurrentSize)	resize(n);													\
117 		else						resetIndices();												\
118 		mPreviousSize = n;																		\
119 	}
120 
121 #define CREATE_HISTOGRAMS(type, buffer)															\
122 	/* Clear counters */																		\
123 	memset(mHistogram, 0, 256*4*sizeof(uint32));												\
124 																								\
125 	/* Prepare for temporal coherence */														\
126 	type PrevVal = (type)buffer[mIndices[0]];													\
127 	bool AlreadySorted = true;	/* Optimism... */												\
128 	uint32* Indices = mIndices;																	\
129 																								\
130 	/* Prepare to count */																		\
131 	uint8* p = (uint8*)input;																	\
132 	uint8* pe = &p[nb*4];																		\
133 	uint32* h0= &mHistogram[0];		/* Histogram for first pass (LSB)	*/						\
134 	uint32* h1= &mHistogram[256];	/* Histogram for second pass		*/						\
135 	uint32* h2= &mHistogram[512];	/* Histogram for third pass			*/						\
136 	uint32* h3= &mHistogram[768];	/* Histogram for last pass (MSB)	*/						\
137 																								\
138 	while(p!=pe)																				\
139 	{																							\
140 		/* Read input buffer in previous sorted order */										\
141 		type Val = (type)buffer[*Indices++];													\
142 		/* Check whether already sorted or not */												\
143 		if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */						\
144 		/* Update for next iteration */															\
145 		PrevVal = Val;																			\
146 																								\
147 		/* Create histograms */																	\
148 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
149 	}																							\
150 																								\
151 	/* If all input values are already sorted, we just have to return and leave the */			\
152 	/* previous list unchanged. That way the routine may take advantage of temporal */			\
153 	/* coherence, for example when used to sort transparent faces.					*/			\
154 	if(AlreadySorted)	{ mNbHits++; return *this;	}											\
155 																								\
156 	/* Else there has been an early out and we must finish computing the histograms */			\
157 	while(p!=pe)																				\
158 	{																							\
159 		/* Create histograms without the previous overhead */									\
160 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;											\
161 	}
162 
163 #define CHECK_PASS_VALIDITY(pass)																\
164 	/* Shortcut to current counters */															\
165 	uint32* CurCount = &mHistogram[pass<<8];													\
166 																								\
167 	/* Reset flag. The sorting pass is supposed to be performed. (default) */					\
168 	bool PerformPass = true;																	\
169 																								\
170 	/* Check pass validity */																	\
171 																								\
172 	/* If all values have the same byte, sorting is useless. */									\
173 	/* It may happen when sorting bytes or words instead of dwords. */							\
174 	/* This routine actually sorts words faster than dwords, and bytes */						\
175 	/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */					\
176 	/* for words and O(n) for bytes. Running time for floats depends on actual values... */		\
177 																								\
178 	/* Get first byte */																		\
179 	uint8 UniqueVal = *(((uint8*)input)+pass);													\
180 																								\
181 	/* Check that byte's counter */																\
182 	if(CurCount[UniqueVal]==nb)	PerformPass=false;
183 
184 
185 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
186 /**
187  *	Main sort routine.
188  *	This one is for integer values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
189  *	\param		input			[in] a list of integer values to sort
190  *	\param		nb				[in] number of values to sort
191  *	\param		signedvalues	[in] true to handle negative values, false if you know your input buffer only contains positive values
192  *	\return		Self-Reference
193  */
194 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
sort(const uint32 * input,uint32 nb,bool signedvalues)195 RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
196 {
197 	uint32 i, j;
198 
199 	// Checkings
200 	if(!input || !nb)	return *this;
201 
202 	// Stats
203 	mTotalCalls++;
204 
205 	// Resize lists if needed
206 	CHECK_RESIZE(nb);
207 
208 #ifdef RADIX_LOCAL_RAM
209 	// Allocate histograms & offsets on the stack
210 	uint32 mHistogram[256*4];
211 	uint32 mOffset[256];
212 #endif
213 
214 	// Create histograms (counters). Counters for all passes are created in one run.
215 	// Pros:	read input buffer once instead of four times
216 	// Cons:	mHistogram is 4Kb instead of 1Kb
217 	// We must take care of signed/unsigned values for temporal coherence.... I just
218 	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
219 	if(!signedvalues)	{ CREATE_HISTOGRAMS(uint32, input);	}
220 	else				{ CREATE_HISTOGRAMS(int32, input);	}
221 
222 	// Compute #negative values involved if needed
223 	uint32 NbNegativeValues = 0;
224 	if(signedvalues)
225 	{
226 		// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
227 		// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
228 		// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
229 		uint32* h3= &mHistogram[768];
230 		for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
231 	}
232 
233 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
234 	for( j=0;j<4;j++)
235 	{
236 		CHECK_PASS_VALIDITY(j);
237 
238 		// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
239 		// not a problem, numbers are correctly sorted anyway.
240 		if(PerformPass)
241 		{
242 			// Should we care about negative values?
243 			if(j!=3 || !signedvalues)
244 			{
245 				// Here we deal with positive values only
246 
247 				// Create offsets
248 				mOffset[0] = 0;
249 				for(i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
250 			}
251 			else
252 			{
253 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
254 
255 				// Create biased offsets, in order for negative numbers to be sorted as well
256 				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
257 				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
258 
259 				// Fixing the wrong place for negative values
260 				mOffset[128] = 0;
261 				for(i=129;i<256;i++)			mOffset[i] = mOffset[i-1] + CurCount[i-1];
262 			}
263 
264 			// Perform Radix Sort
265 			uint8* InputBytes	= (uint8*)input;
266 			uint32* Indices		= mIndices;
267 			uint32* IndicesEnd	= &mIndices[nb];
268 			InputBytes += j;
269 			while(Indices!=IndicesEnd)
270 			{
271 				uint32 id = *Indices++;
272 				mIndices2[mOffset[InputBytes[id<<2]]++] = id;
273 			}
274 
275 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
276 			uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
277 		}
278 	}
279 	return *this;
280 }
281 
282 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
283 /**
284  *	Main sort routine.
285  *	This one is for floating-point values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
286  *	\param		input			[in] a list of floating-point values to sort
287  *	\param		nb				[in] number of values to sort
288  *	\return		Self-Reference
289  *	\warning	only sorts IEEE floating-point values
290  */
291 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
sort(const float * input2,uint32 nb)292 RadixSort& RadixSort::sort(const float* input2, uint32 nb)
293 {
294 	uint32 i, j;
295 
296 	// Checkings
297 	if(!input2 || !nb)	return *this;
298 
299 	// Stats
300 	mTotalCalls++;
301 
302 	uint32* input = (uint32*)input2;
303 
304 	// Resize lists if needed
305 	CHECK_RESIZE(nb);
306 
307 #ifdef RADIX_LOCAL_RAM
308 	// Allocate histograms & offsets on the stack
309 	uint32 mHistogram[256*4];
310 	uint32 mOffset[256];
311 #endif
312 
313 	// Create histograms (counters). Counters for all passes are created in one run.
314 	// Pros:	read input buffer once instead of four times
315 	// Cons:	mHistogram is 4Kb instead of 1Kb
316 	// Floating-point values are always supposed to be signed values, so there's only one code path there.
317 	// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
318 	// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
319 	// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
320 	// wouldn't work with mixed positive/negative values....
321 	{ CREATE_HISTOGRAMS(float, input2); }
322 
323 	// Compute #negative values involved if needed
324 	uint32 NbNegativeValues = 0;
325 	// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
326 	// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
327 	// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
328 	uint32* h3= &mHistogram[768];
329 	for( i=128;i<256;i++)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
330 
331 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
332 	for( j=0;j<4;j++)
333 	{
334 		// Should we care about negative values?
335 		if(j!=3)
336 		{
337 			// Here we deal with positive values only
338 			CHECK_PASS_VALIDITY(j);
339 
340 			if(PerformPass)
341 			{
342 				// Create offsets
343 				mOffset[0] = 0;
344 				for( i=1;i<256;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];
345 
346 				// Perform Radix Sort
347 				uint8* InputBytes	= (uint8*)input;
348 				uint32* Indices		= mIndices;
349 				uint32* IndicesEnd	= &mIndices[nb];
350 				InputBytes += j;
351 				while(Indices!=IndicesEnd)
352 				{
353 					uint32 id = *Indices++;
354 					mIndices2[mOffset[InputBytes[id<<2]]++] = id;
355 				}
356 
357 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
358 				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
359 			}
360 		}
361 		else
362 		{
363 			// This is a special case to correctly handle negative values
364 			CHECK_PASS_VALIDITY(j);
365 
366 			if(PerformPass)
367 			{
368 				// Create biased offsets, in order for negative numbers to be sorted as well
369 				mOffset[0] = NbNegativeValues;												// First positive number takes place after the negative ones
370 				for(i=1;i<128;i++)		mOffset[i] = mOffset[i-1] + CurCount[i-1];	// 1 to 128 for positive numbers
371 
372 				// We must reverse the sorting order for negative numbers!
373 				mOffset[255] = 0;
374 				for(i=0;i<127;i++)		mOffset[254-i] = mOffset[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
375 				for(i=128;i<256;i++)	mOffset[i] += CurCount[i];							// Fixing the wrong place for negative values
376 
377 				// Perform Radix Sort
378 				for(i=0;i<nb;i++)
379 				{
380 					uint32 Radix = input[mIndices[i]]>>24;								// Radix byte, same as above. AND is useless here (uint32).
381 					// ### cmp to be killed. Not good. Later.
382 					if(Radix<128)		mIndices2[mOffset[Radix]++] = mIndices[i];		// Number is positive, same as above
383 					else				mIndices2[--mOffset[Radix]] = mIndices[i];		// Number is negative, flip the sorting order
384 				}
385 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
386 				uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
387 			}
388 			else
389 			{
390 				// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
391 				if(UniqueVal>=128)
392 				{
393 					for(i=0;i<nb;i++)	mIndices2[i] = mIndices[nb-i-1];
394 
395 					// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
396 					uint32* Tmp	= mIndices;	mIndices = mIndices2; mIndices2 = Tmp;
397 				}
398 			}
399 		}
400 	}
401 	return *this;
402 }
403 
404 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
405 /**
406  *	Resets the inner indices. After the call, mIndices is reset.
407  */
408 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
resetIndices()409 void RadixSort::resetIndices()
410 {
411 	for(uint32 i=0;i<mCurrentSize;i++)	mIndices[i] = i;
412 }
413 
414 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
415 /**
416  *	Gets the ram used.
417  *	\return		memory used in bytes
418  */
419 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
usedRam() const420 uint32 RadixSort::usedRam() const
421 {
422 	uint32 UsedRam = sizeof(RadixSort);
423 #ifndef RADIX_LOCAL_RAM
424 	UsedRam += 256*4*sizeof(uint32);			// Histograms
425 	UsedRam += 256*sizeof(uint32);				// Offsets
426 #endif
427 	UsedRam += 2*mCurrentSize*sizeof(uint32);	// 2 lists of indices
428 	return UsedRam;
429 }
430