1 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2 /**
3 * Contains source code from the article "Radix Sort Revisited".
4 * \file Radix.cpp
5 * \author Pierre Terdiman
6 * \date April, 4, 2000
7 */
8 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
9
10 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11 /**
12 * Revisited Radix Sort.
13 * This is my new radix routine:
14 * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15 * - it creates all the histograms in one run instead of four
16 * - it sorts words faster than dwords and bytes faster than words
17 * - it correctly sorts negative floating-point values by patching the offsets
18 * - it automatically takes advantage of temporal coherence
19 * - multiple keys support is a side effect of temporal coherence
20 * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
21 *
22 * History:
23 * - 08.15.98: very first version
24 * - 04.04.00: recoded for the radix article
25 * - 12.xx.00: code lifting
26 * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27 * - 10.11.01: added local ram support
28 * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29 *
30 * \class RadixSort
31 * \author Pierre Terdiman
32 * \version 1.3
33 * \date August, 15, 1998
34 */
35 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
36
37 /*
38 To do:
39 - add an offset parameter between two input values (avoid some data recopy sometimes)
40 - unroll ? asm ?
41 */
42
43 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
44 // Header
45
46 #include <nvcore/Radix.h>
47
48 #include <string.h> // memset
49
50 //using namespace IceCore;
51
52 #define DELETEARRAY(a) { delete [] a; a = NULL; }
53 #define CHECKALLOC(a)
54
55
56
57 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
58 /**
59 * Constructor.
60 */
61 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
RadixSort()62 RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
63 {
64 #ifndef RADIX_LOCAL_RAM
65 // Allocate input-independent ram
66 mHistogram = new uint32[256*4];
67 mOffset = new uint32[256];
68 #endif
69 // Initialize indices
70 resetIndices();
71 }
72
73 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
74 /**
75 * Destructor.
76 */
77 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
~RadixSort()78 RadixSort::~RadixSort()
79 {
80 // Release everything
81 #ifndef RADIX_LOCAL_RAM
82 DELETEARRAY(mOffset);
83 DELETEARRAY(mHistogram);
84 #endif
85 DELETEARRAY(mIndices2);
86 DELETEARRAY(mIndices);
87 }
88
89 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
90 /**
91 * Resizes the inner lists.
92 * \param nb [in] new size (number of dwords)
93 * \return true if success
94 */
95 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
resize(uint32 nb)96 bool RadixSort::resize(uint32 nb)
97 {
98 // Free previously used ram
99 DELETEARRAY(mIndices2);
100 DELETEARRAY(mIndices);
101
102 // Get some fresh one
103 mIndices = new uint32[nb]; CHECKALLOC(mIndices);
104 mIndices2 = new uint32[nb]; CHECKALLOC(mIndices2);
105 mCurrentSize = nb;
106
107 // Initialize indices so that the input buffer is read in sequential order
108 resetIndices();
109
110 return true;
111 }
112
113 #define CHECK_RESIZE(n) \
114 if(n!=mPreviousSize) \
115 { \
116 if(n>mCurrentSize) resize(n); \
117 else resetIndices(); \
118 mPreviousSize = n; \
119 }
120
121 #define CREATE_HISTOGRAMS(type, buffer) \
122 /* Clear counters */ \
123 memset(mHistogram, 0, 256*4*sizeof(uint32)); \
124 \
125 /* Prepare for temporal coherence */ \
126 type PrevVal = (type)buffer[mIndices[0]]; \
127 bool AlreadySorted = true; /* Optimism... */ \
128 uint32* Indices = mIndices; \
129 \
130 /* Prepare to count */ \
131 uint8* p = (uint8*)input; \
132 uint8* pe = &p[nb*4]; \
133 uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
134 uint32* h1= &mHistogram[256]; /* Histogram for second pass */ \
135 uint32* h2= &mHistogram[512]; /* Histogram for third pass */ \
136 uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
137 \
138 while(p!=pe) \
139 { \
140 /* Read input buffer in previous sorted order */ \
141 type Val = (type)buffer[*Indices++]; \
142 /* Check whether already sorted or not */ \
143 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
144 /* Update for next iteration */ \
145 PrevVal = Val; \
146 \
147 /* Create histograms */ \
148 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
149 } \
150 \
151 /* If all input values are already sorted, we just have to return and leave the */ \
152 /* previous list unchanged. That way the routine may take advantage of temporal */ \
153 /* coherence, for example when used to sort transparent faces. */ \
154 if(AlreadySorted) { mNbHits++; return *this; } \
155 \
156 /* Else there has been an early out and we must finish computing the histograms */ \
157 while(p!=pe) \
158 { \
159 /* Create histograms without the previous overhead */ \
160 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
161 }
162
163 #define CHECK_PASS_VALIDITY(pass) \
164 /* Shortcut to current counters */ \
165 uint32* CurCount = &mHistogram[pass<<8]; \
166 \
167 /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
168 bool PerformPass = true; \
169 \
170 /* Check pass validity */ \
171 \
172 /* If all values have the same byte, sorting is useless. */ \
173 /* It may happen when sorting bytes or words instead of dwords. */ \
174 /* This routine actually sorts words faster than dwords, and bytes */ \
175 /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
176 /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
177 \
178 /* Get first byte */ \
179 uint8 UniqueVal = *(((uint8*)input)+pass); \
180 \
181 /* Check that byte's counter */ \
182 if(CurCount[UniqueVal]==nb) PerformPass=false;
183
184
185 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
186 /**
187 * Main sort routine.
188 * This one is for integer values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
189 * \param input [in] a list of integer values to sort
190 * \param nb [in] number of values to sort
191 * \param signedvalues [in] true to handle negative values, false if you know your input buffer only contains positive values
192 * \return Self-Reference
193 */
194 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
sort(const uint32 * input,uint32 nb,bool signedvalues)195 RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
196 {
197 uint32 i, j;
198
199 // Checkings
200 if(!input || !nb) return *this;
201
202 // Stats
203 mTotalCalls++;
204
205 // Resize lists if needed
206 CHECK_RESIZE(nb);
207
208 #ifdef RADIX_LOCAL_RAM
209 // Allocate histograms & offsets on the stack
210 uint32 mHistogram[256*4];
211 uint32 mOffset[256];
212 #endif
213
214 // Create histograms (counters). Counters for all passes are created in one run.
215 // Pros: read input buffer once instead of four times
216 // Cons: mHistogram is 4Kb instead of 1Kb
217 // We must take care of signed/unsigned values for temporal coherence.... I just
218 // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
219 if(!signedvalues) { CREATE_HISTOGRAMS(uint32, input); }
220 else { CREATE_HISTOGRAMS(int32, input); }
221
222 // Compute #negative values involved if needed
223 uint32 NbNegativeValues = 0;
224 if(signedvalues)
225 {
226 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
227 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
228 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
229 uint32* h3= &mHistogram[768];
230 for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
231 }
232
233 // Radix sort, j is the pass number (0=LSB, 3=MSB)
234 for( j=0;j<4;j++)
235 {
236 CHECK_PASS_VALIDITY(j);
237
238 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
239 // not a problem, numbers are correctly sorted anyway.
240 if(PerformPass)
241 {
242 // Should we care about negative values?
243 if(j!=3 || !signedvalues)
244 {
245 // Here we deal with positive values only
246
247 // Create offsets
248 mOffset[0] = 0;
249 for(i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
250 }
251 else
252 {
253 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
254
255 // Create biased offsets, in order for negative numbers to be sorted as well
256 mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
257 for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
258
259 // Fixing the wrong place for negative values
260 mOffset[128] = 0;
261 for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
262 }
263
264 // Perform Radix Sort
265 uint8* InputBytes = (uint8*)input;
266 uint32* Indices = mIndices;
267 uint32* IndicesEnd = &mIndices[nb];
268 InputBytes += j;
269 while(Indices!=IndicesEnd)
270 {
271 uint32 id = *Indices++;
272 mIndices2[mOffset[InputBytes[id<<2]]++] = id;
273 }
274
275 // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
276 uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
277 }
278 }
279 return *this;
280 }
281
282 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
283 /**
284 * Main sort routine.
285 * This one is for floating-point values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
286 * \param input [in] a list of floating-point values to sort
287 * \param nb [in] number of values to sort
288 * \return Self-Reference
289 * \warning only sorts IEEE floating-point values
290 */
291 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
sort(const float * input2,uint32 nb)292 RadixSort& RadixSort::sort(const float* input2, uint32 nb)
293 {
294 uint32 i, j;
295
296 // Checkings
297 if(!input2 || !nb) return *this;
298
299 // Stats
300 mTotalCalls++;
301
302 uint32* input = (uint32*)input2;
303
304 // Resize lists if needed
305 CHECK_RESIZE(nb);
306
307 #ifdef RADIX_LOCAL_RAM
308 // Allocate histograms & offsets on the stack
309 uint32 mHistogram[256*4];
310 uint32 mOffset[256];
311 #endif
312
313 // Create histograms (counters). Counters for all passes are created in one run.
314 // Pros: read input buffer once instead of four times
315 // Cons: mHistogram is 4Kb instead of 1Kb
316 // Floating-point values are always supposed to be signed values, so there's only one code path there.
317 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
318 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
319 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
320 // wouldn't work with mixed positive/negative values....
321 { CREATE_HISTOGRAMS(float, input2); }
322
323 // Compute #negative values involved if needed
324 uint32 NbNegativeValues = 0;
325 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
326 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
327 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
328 uint32* h3= &mHistogram[768];
329 for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
330
331 // Radix sort, j is the pass number (0=LSB, 3=MSB)
332 for( j=0;j<4;j++)
333 {
334 // Should we care about negative values?
335 if(j!=3)
336 {
337 // Here we deal with positive values only
338 CHECK_PASS_VALIDITY(j);
339
340 if(PerformPass)
341 {
342 // Create offsets
343 mOffset[0] = 0;
344 for( i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
345
346 // Perform Radix Sort
347 uint8* InputBytes = (uint8*)input;
348 uint32* Indices = mIndices;
349 uint32* IndicesEnd = &mIndices[nb];
350 InputBytes += j;
351 while(Indices!=IndicesEnd)
352 {
353 uint32 id = *Indices++;
354 mIndices2[mOffset[InputBytes[id<<2]]++] = id;
355 }
356
357 // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
358 uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
359 }
360 }
361 else
362 {
363 // This is a special case to correctly handle negative values
364 CHECK_PASS_VALIDITY(j);
365
366 if(PerformPass)
367 {
368 // Create biased offsets, in order for negative numbers to be sorted as well
369 mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
370 for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
371
372 // We must reverse the sorting order for negative numbers!
373 mOffset[255] = 0;
374 for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
375 for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
376
377 // Perform Radix Sort
378 for(i=0;i<nb;i++)
379 {
380 uint32 Radix = input[mIndices[i]]>>24; // Radix byte, same as above. AND is useless here (uint32).
381 // ### cmp to be killed. Not good. Later.
382 if(Radix<128) mIndices2[mOffset[Radix]++] = mIndices[i]; // Number is positive, same as above
383 else mIndices2[--mOffset[Radix]] = mIndices[i]; // Number is negative, flip the sorting order
384 }
385 // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
386 uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
387 }
388 else
389 {
390 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
391 if(UniqueVal>=128)
392 {
393 for(i=0;i<nb;i++) mIndices2[i] = mIndices[nb-i-1];
394
395 // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
396 uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
397 }
398 }
399 }
400 }
401 return *this;
402 }
403
404 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
405 /**
406 * Resets the inner indices. After the call, mIndices is reset.
407 */
408 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
resetIndices()409 void RadixSort::resetIndices()
410 {
411 for(uint32 i=0;i<mCurrentSize;i++) mIndices[i] = i;
412 }
413
414 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
415 /**
416 * Gets the ram used.
417 * \return memory used in bytes
418 */
419 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
usedRam() const420 uint32 RadixSort::usedRam() const
421 {
422 uint32 UsedRam = sizeof(RadixSort);
423 #ifndef RADIX_LOCAL_RAM
424 UsedRam += 256*4*sizeof(uint32); // Histograms
425 UsedRam += 256*sizeof(uint32); // Offsets
426 #endif
427 UsedRam += 2*mCurrentSize*sizeof(uint32); // 2 lists of indices
428 return UsedRam;
429 }
430