/***************************************************************************
                          sorting.cpp - all things sortlike
                             -------------------
    begin                : Nov 2018
    copyright            : (C) 2018 by G. Duvert and Others
    email                : gilles dot duvert at free dot fr
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#include "includefirst.hpp"

#include "datatypes.hpp"
#include "envt.hpp"
#include "dinterpreter.hpp"

namespace lib {
#define INSERTION_SORT_THRESHOLD 256 // beyond this, merge is better (floats)
#define QUICK_SORT_THRESHOLD 0 // never better
#define RADIX_SORT_THRESHOLD_ADAPT 2000000 // for Adaptive value, non-floats
#define RADIX_SORT_THRESHOLD_ADAPT_FLOAT 600000 // NOT USED: for Adaptive value, floats (we do not use Radix for floats in adaptive mode due to the -NaN feature)
#define RADIX_SORT_THRESHOLD_FOR_FLOAT 30000000 // NOT USED: radix vs. idl (we do not use Radix for floats due to the -NaN feature)
#define RADIX_SORT_THRESHOLD_FOR_DOUBLE 6000000 // idem
// radix is better than anything from 0 to this value for all integer types
#define RADIX_SORT_THRESHOLD_FOR_LONG 3000000
#define RADIX_SORT_THRESHOLD_FOR_ULONG 3000000
#define RADIX_SORT_THRESHOLD_FOR_LONG64 2000000
#define RADIX_SORT_THRESHOLD_FOR_ULONG64 1000000
#define MERGESORT_PARALLEL_THRESHOLD 1000000

// The following GDL version of SORT() is a complete rewrite using modern methods for a faster sort.
// It provides a noticeable, sometimes even huge, speed gain, depending on array sizes and types.
// This version uses
// 1) modified code from https://github.com/Pierre-Terdiman/RadixRedux
// (adapted to GDL interfaces and to other types than floats and longs) plus
// 2) new versions of other sorting methods, as the various codes found on the web sort values,
// not indexes, and do not sort NaNs at all.
// I've added a GDL_SORT() "private" function that allows choosing one of the /RADIX, /QUICK, /MERGE and /INSERT
// sorting methods. This function is used in the procedure testsuite/benchmark/compare_sort_algos
// to plot the performance of all the different sorts (insertion is not used as it is O(N^2) and would take forever)
// and to tune the combination of these methods that gives the best performance in the final SORT().
// With a modicum of work, this could be used to fine-tune the SORT() method for a particular machine or purpose,
// maintaining top performance in specific cases (clusters etc.).
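//
// A minimal usage sketch of the index-returning RadixSort<> overloads defined below
// (illustrative only; the toy values are an assumption, not code from the original):
// each overload returns a gdlAlignedMalloc'ed array of ranks such that
// input[ranks[0]] <= input[ranks[1]] <= ..., without moving the values themselves:
//
//   DLong data[5] = {3, -1, 4, 1, -5};
//   DLong* ranks = RadixSort<DLong>(data, 5); // ranks is {4, 1, 3, 0, 2}
//   // data[ranks[i]] enumerates data in ascending order
//   gdlAlignedFree(ranks);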
//------------------Radix Sorting Codes ------------------------------------------------------------------------------
// Here is the copyright for the Radix code by PT. It is understood that Pierre's code was for Floats, Longs and ULongs only.
//Copyright (c) 2018 Pierre Terdiman - http://www.codercorner.com/blog
//
//
//This software is provided 'as-is', without any express or implied warranty.
//In no event will the authors be held liable for any damages arising from the use of this software.
//Permission is granted to anyone to use this software for any purpose, including commercial applications,
//and to alter it and redistribute it freely, subject to the following restrictions:
//1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.
//If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
//2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
//3. This notice may not be removed or altered from any source distribution.
//------------------Radix Sorting Codes ------------------------------------------------------------------------------
//
#if defined(IS_BIGENDIAN)
#define H0_OFFSET2 256
#define H1_OFFSET2 0
#define BYTES_INC2 (1-j)
#define H0_OFFSET4 768
#define H1_OFFSET4 512
#define H2_OFFSET4 256
#define H3_OFFSET4 0
#define BYTES_INC4 (3-j)
#define H7_OFFSET8 0
#define H6_OFFSET8 256
#define H5_OFFSET8 512
#define H4_OFFSET8 768
#define H3_OFFSET8 1024
#define H2_OFFSET8 1280
#define H1_OFFSET8 1536
#define H0_OFFSET8 1792
#define BYTES_INC8 (7-j)
#else
#define H0_OFFSET2 0
#define H1_OFFSET2 256
#define BYTES_INC2 j
#define H0_OFFSET4 0
#define H1_OFFSET4 256
#define H2_OFFSET4 512
#define H3_OFFSET4 768
#define BYTES_INC4 j
#define H0_OFFSET8 0
#define H1_OFFSET8 256
#define H2_OFFSET8 512
#define H3_OFFSET8 768
#define H4_OFFSET8 1024
#define H5_OFFSET8 1280
#define H6_OFFSET8 1536
#define H7_OFFSET8 1792
#define BYTES_INC8 j
#endif
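
// Note (added): radix pass j must always consume bytes from least significant (j == 0) to
// most significant. On little-endian machines memory byte j of a value is its j-th least
// significant byte, so BYTES_INC is simply j; on big-endian machines it is byte (width-1-j),
// and the Hn_OFFSET values above remap the per-byte histograms accordingly.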

#include <string.h> //for memset (Windows)

#define CREATE_HISTOGRAMS2(type, buffer) \
  /* Clear counters/histograms */ \
  memset(Histogram, 0, 256*2*sizeof(T)); \
\
  /* Prepare to count */ \
  const DByte* p = (const DByte*)input; \
  const DByte* pe = &p[nb*2]; \
  T* h0 = &Histogram[H0_OFFSET2]; /* Histogram for first pass (LSB) */ \
  T* h1 = &Histogram[H1_OFFSET2]; /* Histogram for last pass (MSB) */ \
\
  bool AlreadySorted = true; /* Optimism... */ \
\
  if(ranksUnInitialized) \
  { \
    /* Prepare for temporal coherence */ \
    const type* Running = (type*)buffer; \
    type PrevVal = *Running; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = *Running++; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) \
    { \
      for(SizeT i=0;i<nb;++i) mRanks[i] = i; \
      return mRanks; \
    } \
  } \
  else \
  { \
    /* Prepare for temporal coherence */ \
    const T* Indices = mRanks; \
    type PrevVal = (type)buffer[*Indices]; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = (type)buffer[*Indices++]; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) return mRanks; \
  } \
\
  /* Else there has been an early out and we must finish computing the histograms */ \
  while(p!=pe) \
  { \
    /* Create histograms without the previous overhead */ \
    h0[*p++]++; h1[*p++]++; \
  }

#define CREATE_HISTOGRAMS4(type, buffer) \
  /* Clear counters/histograms */ \
  memset(Histogram, 0, 256*4*sizeof(T)); \
\
  /* Prepare to count */ \
  const DByte* p = (const DByte*)input; \
  const DByte* pe = &p[nb*4]; \
  T* h0 = &Histogram[H0_OFFSET4]; /* Histogram for first pass (LSB) */ \
  T* h1 = &Histogram[H1_OFFSET4]; /* Histogram for second pass */ \
  T* h2 = &Histogram[H2_OFFSET4]; /* Histogram for third pass */ \
  T* h3 = &Histogram[H3_OFFSET4]; /* Histogram for last pass (MSB) */ \
\
  bool AlreadySorted = true; /* Optimism... */ \
\
  if(ranksUnInitialized) \
  { \
    /* Prepare for temporal coherence */ \
    const type* Running = (type*)buffer; \
    type PrevVal = *Running; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = *Running++; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) \
    { \
      for(SizeT i=0;i<nb;++i) mRanks[i] = i; \
      return mRanks; \
    } \
  } \
  else \
  { \
    /* Prepare for temporal coherence */ \
    const T* Indices = mRanks; \
    type PrevVal = (type)buffer[*Indices]; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = (type)buffer[*Indices++]; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) return mRanks; \
  } \
\
  /* Else there has been an early out and we must finish computing the histograms */ \
  while(p!=pe) \
  { \
    /* Create histograms without the previous overhead */ \
    h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
  }

#define CREATE_HISTOGRAMS8(type, buffer) \
  /* Clear counters/histograms */ \
  memset(Histogram, 0, 256*8*sizeof(T)); \
\
  /* Prepare to count */ \
  const DByte* p = (const DByte*)input; \
  const DByte* pe = &p[nb*8]; \
  T* h0 = &Histogram[H0_OFFSET8]; /* Histogram for first pass (LSB) */ \
  T* h1 = &Histogram[H1_OFFSET8]; /* */ \
  T* h2 = &Histogram[H2_OFFSET8]; /* */ \
  T* h3 = &Histogram[H3_OFFSET8]; /* */ \
  T* h4 = &Histogram[H4_OFFSET8]; /* */ \
  T* h5 = &Histogram[H5_OFFSET8]; /* */ \
  T* h6 = &Histogram[H6_OFFSET8]; /* */ \
  T* h7 = &Histogram[H7_OFFSET8]; /* Histogram for last pass (MSB) */ \
\
  bool AlreadySorted = true; /* Optimism... */ \
\
  if(ranksUnInitialized) \
  { \
    /* Prepare for temporal coherence */ \
    const type* Running = (type*)buffer; \
    type PrevVal = *Running; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = *Running++; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
      h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) \
    { \
      for(SizeT i=0;i<nb;++i) mRanks[i] = i; \
      return mRanks; \
    } \
  } \
  else \
  { \
    /* Prepare for temporal coherence */ \
    const T* Indices = mRanks; \
    type PrevVal = (type)buffer[*Indices]; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = (type)buffer[*Indices++]; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
      h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) return mRanks; \
  } \
\
  /* Else there has been an early out and we must finish computing the histograms */ \
  while(p!=pe) \
  { \
    /* Create histograms without the previous overhead */ \
    h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
    h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
  }
// the optimization in case of an 'already sorted (sub)array' cannot hold for floats and doubles,
// as NaNs do NOT compare to anything, and the 'already sorted' verdict would be wrong.
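// For example, with {1.0, NaN, 2.0} neither NaN < 1.0 nor 2.0 < NaN is true, so a plain
// Val < PrevVal scan would never see a descending pair and would wrongly report the array
// as sorted; hence the extra std::isnan() tests in the two macros below.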

#define CREATE_HISTOGRAMSFLOAT(type, buffer) \
  /* Clear counters/histograms */ \
  memset(Histogram, 0, 256*4*sizeof(T)); \
\
  /* Prepare to count */ \
  const DByte* p = (const DByte*)input; \
  const DByte* pe = &p[nb*4]; \
  T* h0 = &Histogram[H0_OFFSET4]; /* Histogram for first pass (LSB) */ \
  T* h1 = &Histogram[H1_OFFSET4]; /* Histogram for second pass */ \
  T* h2 = &Histogram[H2_OFFSET4]; /* Histogram for third pass */ \
  T* h3 = &Histogram[H3_OFFSET4]; /* Histogram for last pass (MSB) */ \
\
  bool AlreadySorted = true; /* Optimism... */ \
\
  if(ranksUnInitialized) \
  { \
    /* Prepare for temporal coherence */ \
    const type* Running = (type*)buffer; \
    type PrevVal = *Running; \
    if (std::isnan(PrevVal)) {AlreadySorted=false;} else { \
      while(p!=pe) \
      { \
        /* Read input buffer in previous sorted order */ \
        const type Val = *Running++; \
        /* Check whether already sorted or not, taking care of NaNs if necessary */ \
        if(Val<PrevVal||std::isnan(Val)) { AlreadySorted = false; break; } /* Early out */ \
        /* Update for next iteration */ \
        PrevVal = Val; \
\
        /* Create histograms */ \
        h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
      } \
    } \
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) \
    { \
      for(SizeT i=0;i<nb;++i) mRanks[i] = i; \
      return mRanks; \
    } \
  } \
  else \
  { \
    /* Prepare for temporal coherence */ \
    const T* Indices = mRanks; \
    type PrevVal = (type)buffer[*Indices]; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = (type)buffer[*Indices++]; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) return mRanks; \
  } \
\
  /* Else there has been an early out and we must finish computing the histograms */ \
  while(p!=pe) \
  { \
    /* Create histograms without the previous overhead */ \
    h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
  }

#define CREATE_HISTOGRAMSDOUBLE(type, buffer) \
  /* Clear counters/histograms */ \
  memset(Histogram, 0, 256*8*sizeof(T)); \
\
  /* Prepare to count */ \
  const DByte* p = (const DByte*)input; \
  const DByte* pe = &p[nb*8]; \
  T* h0 = &Histogram[H0_OFFSET8]; /* Histogram for first pass (LSB) */ \
  T* h1 = &Histogram[H1_OFFSET8]; /* */ \
  T* h2 = &Histogram[H2_OFFSET8]; /* */ \
  T* h3 = &Histogram[H3_OFFSET8]; /* */ \
  T* h4 = &Histogram[H4_OFFSET8]; /* */ \
  T* h5 = &Histogram[H5_OFFSET8]; /* */ \
  T* h6 = &Histogram[H6_OFFSET8]; /* */ \
  T* h7 = &Histogram[H7_OFFSET8]; /* Histogram for last pass (MSB) */ \
\
  bool AlreadySorted = true; /* Optimism... */ \
\
  if(ranksUnInitialized) \
  { \
    /* Prepare for temporal coherence */ \
    const type* Running = (type*)buffer; \
    type PrevVal = *Running; \
    if (std::isnan(PrevVal)) {AlreadySorted=false;} else { \
      while(p!=pe) \
      { \
        /* Read input buffer in previous sorted order */ \
        const type Val = *Running++; \
        /* Check whether already sorted or not, taking care of NaNs if necessary */ \
        if (Val<PrevVal||std::isnan(Val)) { AlreadySorted = false; break; } /* Early out */ \
        /* Update for next iteration */ \
        PrevVal = Val; \
\
        /* Create histograms */ \
        h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
        h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
      } \
    } \
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) \
    { \
      for(SizeT i=0;i<nb;++i) mRanks[i] = i; \
      return mRanks; \
    } \
  } \
  else \
  { \
    /* Prepare for temporal coherence */ \
    const T* Indices = mRanks; \
    type PrevVal = (type)buffer[*Indices]; \
\
    while(p!=pe) \
    { \
      /* Read input buffer in previous sorted order */ \
      const type Val = (type)buffer[*Indices++]; \
      /* Check whether already sorted or not */ \
      if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
      /* Update for next iteration */ \
      PrevVal = Val; \
\
      /* Create histograms */ \
      h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
      h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
    } \
\
    /* If all input values are already sorted, we just have to return and leave the */ \
    /* previous list unchanged. That way the routine may take advantage of temporal */ \
    /* coherence, for example when used to sort transparent faces. */ \
    if(AlreadySorted) return mRanks; \
  } \
\
  /* Else there has been an early out and we must finish computing the histograms */ \
  while(p!=pe) \
  { \
    /* Create histograms without the previous overhead */ \
    h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
    h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++; \
  }

#define CHECK_PASS_VALIDITY(pass) \
  /* Shortcut to current counters */ \
  const T* CurCount = &Histogram[pass<<8]; \
\
  /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
  bool PerformPass = true; \
\
  /* Check pass validity */ \
\
  /* If all values have the same byte, sorting is useless. */ \
  /* It may happen when sorting bytes or words instead of dwords. */ \
  /* This routine actually sorts words faster than dwords, and bytes */ \
  /* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */ \
  /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
\
  /* Get first byte */ \
  const DByte UniqueVal = *(((DByte*)input)+pass); \
\
  /* Check that byte's counter */ \
  if(CurCount[UniqueVal]==nb) PerformPass=false;
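
// Illustrative example (added): if DInt values in the range 0..255 are being sorted, every
// element has a zero upper byte, so for the MSB pass CurCount[0] == nb and that pass is
// skipped entirely; the same mechanism skips the upper passes when small-magnitude DLongs
// are sorted.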

template<typename T>
static T* RadixSort(const double* inputDouble, SizeT nb)
{
  const DULong64* input = (const DULong64*)inputDouble;
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*8];
  T* Link[256];
  bool ranksUnInitialized=true;

  { CREATE_HISTOGRAMSDOUBLE(double, inputDouble); }

  // Radix sort, j is the pass number (0=LSB, 7=MSB)
  for(int j=0;j<8;++j)
  {
    // Should we care about negative values?
    if(j!=7)
    {
      // Here we deal with positive values only
      CHECK_PASS_VALIDITY(j);

      if(PerformPass)
      {
        // Create offsets
        Link[0] = mRanks2;
        for(SizeT i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];

        // Perform Radix Sort
        const DByte* InputBytes = (const DByte*)input;
        InputBytes += BYTES_INC8;
        if(ranksUnInitialized)
        {
          for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<3]]++ = i;
          ranksUnInitialized=false;
        }
        else
        {
          const T* Indices = mRanks;
          const T* IndicesEnd = &mRanks[nb];
          while(Indices!=IndicesEnd)
          {
            const T id = *Indices++;
            *Link[InputBytes[id<<3]]++ = id;
          }
        }

        // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
        T* Tmp = mRanks;
        mRanks = mRanks2;
        mRanks2 = Tmp;
      }
    }
    else
    {
      // This is a special case to correctly handle negative values
      CHECK_PASS_VALIDITY(j);

      if(PerformPass)
      {
        // Compute #negative values involved if needed
        T NbNegativeValues = 0;
        // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
        // ### is that ok on Apple ?!
        const T* h7 = &Histogram[H7_OFFSET8];
        for(int i=128;i<256;++i) NbNegativeValues += h7[i]; // 768 for last histogram, 128 for negative part

        // Create biased offsets, in order for negative numbers to be sorted as well
        Link[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
        for(int i=1;i<128;++i) Link[i] = Link[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers

        // We must reverse the sorting order for negative numbers!
        Link[255] = mRanks2;
        for(int i=0;i<127;++i) Link[254-i] = Link[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
        for(int i=128;i<256;++i) Link[i] += CurCount[i]; // Fixing the wrong place for negative values

        // Perform Radix Sort
        if(ranksUnInitialized)
        {
          for(SizeT i=0;i<nb;++i)
          {
            const T Radix = input[i]>>56; // Radix byte, same as above. AND is useless here (udword).
            // ### cmp to be killed. Not good. Later.
            if(Radix<128) *Link[Radix]++ = i; // Number is positive, same as above
            else *(--Link[Radix]) = i; // Number is negative, flip the sorting order
          }
          ranksUnInitialized=false;
        }
        else
        {
          for(SizeT i=0;i<nb;++i)
          {
            const T Radix = input[mRanks[i]]>>56; // Radix byte, same as above. AND is useless here (udword).
            // ### cmp to be killed. Not good. Later.
            if(Radix<128) *Link[Radix]++ = mRanks[i]; // Number is positive, same as above
            else *(--Link[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
          }
        }
        // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
        T* Tmp = mRanks;
        mRanks = mRanks2;
        mRanks2 = Tmp;
      }
      else
      {
        // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
        if(UniqueVal>=128)
        {
          if(ranksUnInitialized)
          {
            // ###Possible?
            for(SizeT i=0;i<nb;++i) mRanks2[i] = nb-i-1;
            ranksUnInitialized=false;
          }
          else
          {
            for(SizeT i=0;i<nb;++i) mRanks2[i] = mRanks[nb-i-1];
          }

          // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
          T* Tmp = mRanks;
          mRanks = mRanks2;
          mRanks2 = Tmp;
        }
      }
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const float* inputFloat, SizeT nb)
{
  const DULong* input = (const DULong*)inputFloat;
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*4];
  T* Link[256];
  bool ranksUnInitialized=true;

  // Create histograms (counters). Counters for all passes are created in one run.
  // Pros: read input buffer once instead of four times
  // Cons: mHistogram is 4Kb instead of 1Kb
  // Floating-point values are always supposed to be signed values, so there's only one code path there.
  // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
  // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
  // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
  // wouldn't work with mixed positive/negative values....
  { CREATE_HISTOGRAMSFLOAT(float, inputFloat); }

  // Radix sort, j is the pass number (0=LSB, 3=MSB)
  for(int j=0;j<4;++j)
  {
    // Should we care about negative values?
    if(j!=3)
    {
      // Here we deal with positive values only
      CHECK_PASS_VALIDITY(j);

      if(PerformPass)
      {
        // Create offsets
        Link[0] = mRanks2;
        for(SizeT i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];

        // Perform Radix Sort
        const DByte* InputBytes = (const DByte*)input;
        InputBytes += BYTES_INC4;
        if(ranksUnInitialized)
        {
          for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<2]]++ = i;
          ranksUnInitialized=false;
        }
        else
        {
          const T* Indices = mRanks;
          const T* IndicesEnd = &mRanks[nb];
          while(Indices!=IndicesEnd)
          {
            const T id = *Indices++;
            *Link[InputBytes[id<<2]]++ = id;
          }
        }

        // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
        T* Tmp = mRanks;
        mRanks = mRanks2;
        mRanks2 = Tmp;
      }
    }
    else
    {
      // This is a special case to correctly handle negative values
      CHECK_PASS_VALIDITY(j);

      if(PerformPass)
      {
        // Compute #negative values involved if needed
        T NbNegativeValues = 0;
        // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
        // ### is that ok on Apple ?!
        const T* h3 = &Histogram[H3_OFFSET4];
        for(int i=128;i<256;++i) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part

        // Create biased offsets, in order for negative numbers to be sorted as well
        Link[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
        for(int i=1;i<128;++i) Link[i] = Link[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers

        // We must reverse the sorting order for negative numbers!
        Link[255] = mRanks2;
        for(int i=0;i<127;++i) Link[254-i] = Link[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
        for(int i=128;i<256;++i) Link[i] += CurCount[i]; // Fixing the wrong place for negative values

        // Perform Radix Sort
        if(ranksUnInitialized)
        {
          for(SizeT i=0;i<nb;++i)
          {
            const DULong Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
            // ### cmp to be killed. Not good. Later.
            if(Radix<128) *Link[Radix]++ = i; // Number is positive, same as above
            else *(--Link[Radix]) = i; // Number is negative, flip the sorting order
          }
          ranksUnInitialized=false;
        }
        else
        {
          for(SizeT i=0;i<nb;++i)
          {
            const DULong Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
            // ### cmp to be killed. Not good. Later.
            if(Radix<128) *Link[Radix]++ = mRanks[i]; // Number is positive, same as above
            else *(--Link[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
          }
        }
        // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
        T* Tmp = mRanks;
        mRanks = mRanks2;
        mRanks2 = Tmp;
      }
      else
      {
        // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
        if(UniqueVal>=128)
        {
          if(ranksUnInitialized)
          {
            // ###Possible?
            for(SizeT i=0;i<nb;++i) mRanks2[i] = nb-i-1;
            ranksUnInitialized=false;
          }
          else
          {
            for(SizeT i=0;i<nb;++i) mRanks2[i] = mRanks[nb-i-1];
          }

          // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
          T* Tmp = mRanks;
          mRanks = mRanks2;
          mRanks2 = Tmp;
        }
      }
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DLong64* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*8];
  T* Link[256];
  bool ranksUnInitialized=true;

  CREATE_HISTOGRAMS8(DLong64, input);

  // Radix sort, j is the pass number (0=LSB, 7=MSB)
  for(int j=0;j<8;++j)
  {
    CHECK_PASS_VALIDITY(j);

    // Sometimes the last (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
    // not a problem, numbers are correctly sorted anyway.
    if(PerformPass)
    {
      // Should we care about negative values?
      if(j!=7)
      {
        // Here we deal with positive values only

        // Create offsets
        Link[0] = mRanks2;
        for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }
      else
      {
        // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.

        // Compute #negative values involved if needed
        T NbNegativeValues = 0;
        // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
        const T* h7 = &Histogram[H7_OFFSET8];
        for(int i=128;i<256;++i) NbNegativeValues += h7[i]; // 768 for last histogram, 128 for negative part

        // Create biased offsets, in order for negative numbers to be sorted as well
        Link[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
        for(int i=1;i<128;++i) Link[i] = Link[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers

        // Fixing the wrong place for negative values
        Link[128] = mRanks2;
        for(int i=129;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC8;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<3]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<3]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DULong64* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*8];
  T* Link[256];
  bool ranksUnInitialized=true;

  CREATE_HISTOGRAMS8(DULong64, input);

  // Radix sort, j is the pass number (0=LSB, 7=MSB)
  for(int j=0;j<8;++j)
  {
    CHECK_PASS_VALIDITY(j);

    // Sometimes a pass is skipped because all values share the same byte (a constant MSB, for example). This is
    // not a problem, numbers are correctly sorted anyway.
    if(PerformPass)
    {
      // Here we deal with positive values only

      // Create offsets
      Link[0] = mRanks2;
      for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC8;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<3]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<3]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DLong* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*4];
  T* Link[256];
  bool ranksUnInitialized=true;

  // Create histograms (counters). Counters for all passes are created in one run.
  // Pros: read input buffer once instead of four times
  // Cons: mHistogram is 4Kb instead of 1Kb
  // We must take care of signed/unsigned values for temporal coherence.... I just
  // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?

  CREATE_HISTOGRAMS4(DLong, input);

  // Radix sort, j is the pass number (0=LSB, 3=MSB)
  for(int j=0;j<4;++j)
  {
    CHECK_PASS_VALIDITY(j);

    // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
    // not a problem, numbers are correctly sorted anyway.
    if(PerformPass)
    {
      // Should we care about negative values?
      if(j!=3)
      {
        // Here we deal with positive values only

        // Create offsets
        Link[0] = mRanks2;
        for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }
      else
      {
        // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.

        // Compute #negative values involved if needed
        T NbNegativeValues = 0;
        // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
        const T* h3 = &Histogram[H3_OFFSET4];
        for(int i=128;i<256;++i) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part

        // Create biased offsets, in order for negative numbers to be sorted as well
        Link[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
        for(int i=1;i<128;++i) Link[i] = Link[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers

        // Fixing the wrong place for negative values
        Link[128] = mRanks2;
        for(int i=129;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC4;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<2]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<2]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DULong* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*4];
  T* Link[256];
  bool ranksUnInitialized=true;

  // Create histograms (counters). Counters for all passes are created in one run.
  // Pros: read input buffer once instead of four times
  // Cons: mHistogram is 4Kb instead of 1Kb
  // We must take care of signed/unsigned values for temporal coherence.... I just
  // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?

  CREATE_HISTOGRAMS4(DULong, input);

  // Radix sort, j is the pass number (0=LSB, 3=MSB)
  for(int j=0;j<4;++j)
  {
    CHECK_PASS_VALIDITY(j);

    // Sometimes a pass is skipped because all values share the same byte (a constant MSB, for example). This is
    // not a problem, numbers are correctly sorted anyway.
    if(PerformPass)
    {
      // Here we deal with positive values only

      // Create offsets
      Link[0] = mRanks2;
      for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC4;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<2]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<2]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DInt* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*2];
  T* Link[256];
  bool ranksUnInitialized=true;

  CREATE_HISTOGRAMS2(DInt, input);

  // Radix sort, j is the pass number (0=LSB, 1=MSB)
  for(int j=0;j<2;++j)
  {
    CHECK_PASS_VALIDITY(j);

    if(PerformPass)
    {
      // Should we care about negative values?
      if(j!=1)
      {
        // Here we deal with positive values only
        // Create offsets
        Link[0] = mRanks2;
        for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }
      else
      {
        // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
        // Compute #negative values involved if needed
        T NbNegativeValues = 0;
        // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
        const T* h1 = &Histogram[H1_OFFSET2];
        for(int i=128;i<256;++i) NbNegativeValues += h1[i]; // 768 for last histogram, 128 for negative part

        // Create biased offsets, in order for negative numbers to be sorted as well
        Link[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
        for(int i=1;i<128;++i) Link[i] = Link[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers

        // Fixing the wrong place for negative values
        Link[128] = mRanks2;
        for(int i=129;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];
      }

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC2;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<1]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<1]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DUInt* input, SizeT nb)
{
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
  // Allocate histograms & offsets on the stack
  T Histogram[256*2];
  T* Link[256];
  bool ranksUnInitialized=true;

  CREATE_HISTOGRAMS2(DUInt, input);

  // Radix sort, j is the pass number (0=LSB, 1=MSB)
  for(int j=0;j<2;++j)
  {
    CHECK_PASS_VALIDITY(j);

    if(PerformPass)
    {
      // Here we deal with positive values only

      // Create offsets
      Link[0] = mRanks2;
      for(int i=1;i<256;++i) Link[i] = Link[i-1] + CurCount[i-1];

      // Perform Radix Sort
      const DByte* InputBytes = (const DByte*)input;
      InputBytes += BYTES_INC2;
      if(ranksUnInitialized)
      {
        for(SizeT i=0;i<nb;++i) *Link[InputBytes[i<<1]]++ = i;
        ranksUnInitialized=false;
      }
      else
      {
        const T* Indices = mRanks;
        const T* IndicesEnd = &mRanks[nb];
        while(Indices!=IndicesEnd)
        {
          const T id = *Indices++;
          *Link[InputBytes[id<<1]]++ = id;
        }
      }

      // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
      T* Tmp = mRanks;
      mRanks = mRanks2;
      mRanks2 = Tmp;
    }
  }
  gdlAlignedFree(mRanks2);
  return mRanks;
}

template<typename T>
static T* RadixSort(const DByte* input, SizeT nb)
{
  // Allocate histograms & offsets on the stack
  T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
  T Histogram[256];
  T* Link[256];

  /* Clear counters/histograms */
  memset(Histogram, 0, 256 * sizeof (T));

  /* Prepare to count */
  const DByte* p = (const DByte*) input;
  const DByte* pe = &p[nb];
  T* h0 = Histogram;

  bool AlreadySorted = true; /* Optimism... */
  for (SizeT i = 0; i < nb; ++i) mRanks[i] = i;

  /* Prepare for temporal coherence */
  const DByte* Running = (DByte*) input;
  DByte PrevVal = *Running;

  while (p != pe) {
    /* Read input buffer in previous sorted order */
    const DByte Val = *Running++;
    /* Check whether already sorted or not */
    if (Val < PrevVal) {
      AlreadySorted = false;
      break;
    } /* Early out */
    /* Update for next iteration */
    PrevVal = Val;

    /* Create histograms */
    h0[*p++]++;
  }

  /* If all input values are already sorted, we just have to return and leave the */
  /* previous list unchanged. That way the routine may take advantage of temporal */
  /* coherence, for example when used to sort transparent faces. */
  if (AlreadySorted) return mRanks;

  /* Else there has been an early out and we must finish computing the histograms */
  while (p != pe) {
    /* Create histograms without the previous overhead */
    h0[*p++]++;
  }

  /* If all values have the same byte, sorting is useless. */
  /* It may happen when sorting bytes or words instead of dwords. */
  /* This routine actually sorts words faster than dwords, and bytes */
  /* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */
  /* for words and O(n) for bytes. Running time for floats depends on actual values... */

  /* Get first byte */
  const DByte UniqueVal = *((DByte*) input);

  /* Check that byte's counter */
  if (Histogram[UniqueVal] == nb) return mRanks; // already sorted too

  // Here we deal with positive values only
  // Create offsets
  Link[0] = mRanks;
  for(int i=1;i<256;++i) Link[i] = Link[i-1] + Histogram[i-1];

  // Perform Radix Sort
  const DByte* InputBytes = (const DByte*)input;
  for(SizeT i=0;i<nb;++i) *Link[InputBytes[i]]++ = i;
  return mRanks;
}


template<typename T, typename Q>
static void RadixSortIndex(const Q* val, T* index, SizeT lo, SizeT hi)
{
  SizeT length = hi - lo + 1;
  T* res = RadixSort<T>(&(val[lo]), length); // sorted indexes are between 0 and length-1
  for (SizeT i=0; i< length; ++i) index[lo+i]=res[i]+lo; // we need them between lo and hi...
  gdlAlignedFree(res);
}

//-------------------------------------


template<typename T>
inline bool less (T &v, T &w)
{
  return (v < w);
}

template<>
inline bool less (DFloat &v, DFloat &w)
{
  return (v < w || std::isnan(w) );
}

template<>
inline bool less (DDouble &v, DDouble &w)
{
  return (v < w || std::isnan(w) );
}

template<typename T>
inline bool leq (T &v, T &w)
{
  return (v <= w);
}

template<>
inline bool leq (DFloat &v, DFloat &w)
{
  return (v <= w || std::isnan(w) );
}

template<>
inline bool leq (DDouble &v, DDouble &w)
{
  return (v <= w || std::isnan(w) );
}

template<typename T>
inline bool geq (T &v, T &w)
{
  return (v >= w);
}

template<>
inline bool geq (DFloat &v, DFloat &w)
{
  return (v >= w || std::isnan(v) );
}

template<>
inline bool geq (DDouble &v, DDouble &w)
{
  return (v >= w || std::isnan(v) );
}
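
// Explanatory note (added, not from the original code): with the specializations above, a NaN
// on the right-hand side makes less()/leq() return true and a NaN on the left-hand side makes
// geq() return true, i.e. NaNs behave as if larger than any other value and collect at the
// tail of the sorted index list. For example, less(1.0f, NaN) is true while less(NaN, 1.0f)
// is false.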

template<typename T>
inline bool eq (T &v, T &w)
{
  return w == v;
}

template <typename IndexT>
inline void swap(IndexT* z , SizeT a, SizeT b)
{
  IndexT t = z[a];
  z[a] = z[b];
  z[b] = t;
}

template<typename T, typename IndexT>
inline void insertionSortIndex (T* val, IndexT* index, SizeT lo, SizeT hi)
{
  for (SizeT i = lo; i <= hi; ++i)
  {
    for (SizeT j = i; j > lo && less(val[index[j]], val[index[j-1]]); --j)
    {
      swap(index,j,j-1);
    }
  }
}

template<typename T, typename IndexT>
inline SizeT median3_for_qsort ( T* val, IndexT* index, SizeT indI, SizeT indJ, SizeT indK)
{
  return (less(val[index[ indI]], val[index[ indJ]]) ?
    (less(val[index[ indJ]], val[index[ indK]]) ? indJ : less(val[index[ indI]], val[index[ indK]]) ? indK : indI) :
    (less(val[index[ indK]], val[index[ indJ]]) ? indJ : less(val[index[ indK]], val[index[ indI]]) ? indK : indI));
}
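
// Worked example (illustrative): if val[index[indI]] = 5, val[index[indJ]] = 2 and
// val[index[indK]] = 9, median3_for_qsort returns indI, since 5 is the median of {5, 2, 9}.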


template <typename T, typename IndexT>
static void QuickSortIndex(T* val, IndexT* index, DLong left, DLong right)
{

  if (right <= 0)
    return;

  DLong length = right - left + 1;

  // cutoff to insertion sort
  if (length < INSERTION_SORT_THRESHOLD)
  {
    insertionSortIndex(val, index, left, right);
    return;
  }

  // use median-of-3 as partitioning element
  else if (length < INSERTION_SORT_THRESHOLD*4)
  {
    DLong median = median3_for_qsort(val, index, left, left + length / 2, right);
    swap(index,median,left);
  }

  // use Tukey ninther as partitioning element
  else
  {
    DLong eps = length / 8;
    DLong mid = left + length / 2;
    DLong mFirst = median3_for_qsort(val, index, left, left + eps, left + eps + eps);
    DLong mMid = median3_for_qsort(val, index, mid - eps, mid, mid + eps);
    DLong mLast = median3_for_qsort(val, index, right - eps - eps, right - eps, right);
    DLong ninther = median3_for_qsort(val, index, mFirst, mMid, mLast);
    swap(index,ninther,left);
  }

  // Bentley-McIlroy 3-way partitioning
  DLong i = left, j = right + 1;
  DLong index1 = left, index2 = right + 1;

  for (;;)
  {
    T pivot = val[index[left]];
    while (less(val[index[++i]], pivot))
    {
      if (i == right)
        break;
    }
    while (less(pivot, val[index[--j]]))
    {
      if (j == left)
        break;
    }
    if (i >= j) break;
    swap(index,i,j);
    if (eq(val[index[i]], pivot))
      swap(index,++index1,i);
    if (eq(val[index[j]], pivot))
      swap(index,--index2,j);
  }
  swap(index,left,j);

  i = j + 1;
  j = j - 1;
  for (DLong k = left + 1; k <= index1; ++k)
  {
    swap(index,k,j--);
  }
  for (DLong k = right ; k >= index2; --k)
  {
    swap(index,k,i++);
  }

  // QuickSortIndex(val, index, left, j);
  // QuickSortIndex(val, index, i, right);

  // same with parallelism
  DLong Left[2] = {left, i};
  DLong Right[2] = {j, right};
#pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
  for (int i = 0; i < 2; i++) QuickSortIndex(val, index, Left[i], Right[i]);
}


template< typename T, typename IndexT>
static void MergeNoCopyIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT mid, SizeT high, T* val)
{
  SizeT i = low, j = mid + 1;
  for (SizeT k = low; k <= high; ++k) {
    if (i > mid) index[k] = aux[j++];
    else if (j > high) index[k] = aux[i++];
    else if (less(val[aux[j]] , val[aux[i]])) index[k] = aux[j++];
    else index[k] = aux[i++];
  }
}

template< typename T, typename IndexT>
static void MergeSortIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val)
{
  SizeT length = high - low + 1;
  if (length < 2) return;
  if (length < INSERTION_SORT_THRESHOLD) {
    insertionSortIndex(val, index, low, high);
    memcpy(&(aux[low]), &(index[low]), length*sizeof(IndexT));
    return;
  }

  // else MERGESORT:

  // SizeT mid = low + (high - low) / 2;
  // MergeSortIndexAux(index, aux, low, mid, val);
  // MergeSortIndexAux(index, aux, mid+1, high, val);

  // same with parallelism
  SizeT mid = low + (high - low) / 2;
  SizeT Left[2] = {low, mid + 1};
  SizeT Right[2] = {mid, high};
#pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
  for (int i = 0; i < 2; i++) MergeSortIndexAux(index, aux, Left[i], Right[i], val);

  // If arrays are already sorted, finished. This is an
  // optimization that results in faster sorts for nearly ordered lists.
  if (val[aux[mid + 1]] >= val[aux[mid]]) {
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); // give back sub
    return;
  }

  // If arrays are inverted just swap.
  if (leq(val[aux[high]], val[aux[low]])) {
    SizeT left = mid - low + 1;
    SizeT right = high - mid;
    // swap parts:
    memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); // copy 'left' values in aux
    memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); // copy 'right' values starting at low
    memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); // give back aux
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); // give back sub
    return;
  }

  MergeNoCopyIndexAux(aux, index, low, mid, high, val);
}
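
// Explanatory note (added): each recursion level above swaps the roles of 'index' and 'aux'
// (the two halves are sorted *into* aux because the arrays are passed in reverse order),
// so MergeNoCopyIndexAux can merge from aux back into index without the copy-back step of
// textbook mergesort.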

// This should not be used for FLOATS and DOUBLES, as the resulting order, although better, differs from IDL's.
template< typename T, typename IndexT>
static void AdaptiveSortIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val)
{
  SizeT length = high - low + 1;
  if (length < 2) return;
  if (length < INSERTION_SORT_THRESHOLD) {
    insertionSortIndex(val, index, low, high);
    return;
  }
  // RadixSort is stable and differentiates -0 and +0.
  // InsertionSort and Mergesort are stable but do not differentiate -0 and +0.
  // Quicksort is not stable https://en.wikipedia.org/wiki/Sorting_algorithm#Stability (does not keep temporal coherence)
  // and does not differentiate -0 and +0.
  // Quicksort should not be permitted for the default sorting algorithm, but since IDL says that
  // "If Array contains any identical elements, the order in which the identical elements
  // are sorted is arbitrary and may vary between operating systems.", I permit it.

  else if (length < RADIX_SORT_THRESHOLD_ADAPT) { // could be faster if alloc/dealloc was not performed in RadixSort...
    RadixSortIndex(val, index, low, high);
    return;
  }
  // else MERGESORT
  // SizeT mid = low + (high - low) / 2;
  // AdaptiveSortIndexAux(index, aux, low, mid, val);
  // AdaptiveSortIndexAux(index, aux, mid+1, high, val);

  // same with parallelism
  SizeT mid = low + (high - low) / 2;
  SizeT Left[2] = {low, mid + 1};
  SizeT Right[2] = {mid, high};
#pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
  for (int i = 0; i < 2; i++) AdaptiveSortIndexAux(index, aux, Left[i], Right[i], val);

  // If arrays are already sorted, finished. This is an
  // optimization that results in faster sorts for nearly ordered lists. No need to care about NaNs.
  if (val[aux[mid + 1]] >= val[aux[mid]]) {
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); // give back sub
    return;
  }

  // If arrays are inverted just swap. No need to care about NaNs.
  if (val[aux[high]] <= val[aux[low]]) {
    SizeT left = mid - low + 1;
    SizeT right = high - mid;
    // swap parts:
    memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); // copy 'left' values in aux
    memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); // copy 'right' values starting at low
    memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); // give back aux
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); // give back sub
    return;
  }

  MergeNoCopyIndexAux(aux, index, low, mid, high, val);
}

1570 // the only difference is that sue to -NaN, RadixSort is forbidden for Floats and Doubles as the ranking with Radix() is not
1571 // the same as for other sorting alogorithms. Pity, as actually it is Better!
1572 template< typename T, typename IndexT>
AdaptiveSortIndexAuxWithNaN(IndexT * aux,IndexT * index,SizeT low,SizeT high,T * val)1573 static void AdaptiveSortIndexAuxWithNaN(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val) {
1574 SizeT length = high - low + 1;
1575 if (length < 2) return;
1576 if (length < INSERTION_SORT_THRESHOLD) {
1577 insertionSortIndex(val, index, low, high);
1578 return;
1579 }
  //ONLY MERGESORT

  // SizeT mid = low + (high - low) / 2;
  // AdaptiveSortIndexAuxWithNaN(index, aux, low, mid, val);
  // AdaptiveSortIndexAuxWithNaN(index, aux, mid+1, high, val);

  // same with parallelism
  SizeT mid = low + (high - low) / 2;
  SizeT Left[2] = {low, mid + 1};
  SizeT Right[2] = {mid, high};
#pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
  for (int i = 0; i < 2; i++) AdaptiveSortIndexAuxWithNaN(index, aux, Left[i], Right[i], val);

  // If the two halves are already in order, we are finished. This
  // optimization gives faster sorts for nearly ordered lists.
  if (geq(val[aux[mid + 1]], val[aux[mid]])) {
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //copy aux back into index
    return;
  }

  // If the two halves are completely inverted, just swap them.
  if (leq(val[aux[high]], val[aux[low]])) {
    SizeT left = mid - low + 1;
    SizeT right = high - mid;
    // swap parts:
    memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); //stash the 'left' part of aux in index
    memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); //move the 'right' part of aux down to low
    memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); //append the stashed 'left' part after it
    memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //copy the swapped result back into index
    return;
  }

  MergeNoCopyIndexAux(aux, index, low, mid, high, val);
}

template< typename T, typename IndexT>
inline void MergeSortIndex(T* val, IndexT* index, SizeT low, SizeT high)
{
  IndexT* aux = new IndexT[high - low + 1];
  for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
  MergeSortIndexAux(aux, index, low, high, val);
  delete[] aux;
}

template< typename T, typename IndexT>
inline void AdaptiveSortIndex(T* val, IndexT* index, SizeT low, SizeT high)
{
  IndexT* aux = new IndexT[high - low + 1];
  for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
  AdaptiveSortIndexAux(aux, index, low, high, val);
  delete[] aux;
}

template< typename T, typename IndexT>
inline void AdaptiveSortIndexWithNaN(T* val, IndexT* index, SizeT low, SizeT high)
{
  IndexT* aux = new IndexT[high - low + 1];
  for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
  AdaptiveSortIndexAuxWithNaN(aux, index, low, high, val);
  delete[] aux;
}
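
// Minimal usage sketch of the front-ends above (illustrative only; 'demoAdaptiveSortIndex'
// is a hypothetical helper, not part of GDL). It shows the calling convention: the caller
// supplies the value array plus an index array, preset here to the identity permutation.
static inline void demoAdaptiveSortIndex(DFloat* values, DLong* indices, SizeT n)
{
  if (n < 2) return;
  for (SizeT i = 0; i < n; ++i) indices[i] = i;               // identity permutation to start
  AdaptiveSortIndexWithNaN(values, indices, 0, n - 1);        // NaN-aware variant for floats
  // afterwards values[indices[0]], values[indices[1]], ... are in ascending order
}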
//--------------------------------------------------------------------------------------------------------------------
// Sorting algos: the "private" GDL_SORT enables the keywords QUICK, MERGE, RADIX and INSERT (plus AUTO for the default
// combination). Those are not there for the user to choose the algorithm (s)he wants. They exist primarily to test the
// relative speed of each of them and find, for a given machine, where the default algorithm, which is basically a clever
// combination of all of them, should switch from algo to algo in order to retain maximum efficiency.
// One could use testsuite/benchmark/compare_sort_algos.pro to, e.g., update the magic values in the code, or, better, this
// could be done automatically via a training procedure, writing results to a file in ~/.gdl; the present code could then
// use, for example, the values of !GDL_SORT_THRESHOLDS, a system variable (struct) whose values would be updated in the
// ~/.gdl file.
// Default values chosen are for my intel i7, 4 cores, 8 threads.
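
// Sketch of what such a tuning hook could look like (hypothetical: neither the struct,
// the loader, nor the file format exists yet; shown only to make the idea above concrete):
//   struct SortThresholds { SizeT insertion; SizeT radixAdapt; SizeT mergeParallel; };
//   static SortThresholds LoadSortThresholds() {
//     SortThresholds t = {INSERTION_SORT_THRESHOLD, RADIX_SORT_THRESHOLD_ADAPT, MERGESORT_PARALLEL_THRESHOLD};
//     // ...override the defaults from ~/.gdl (or !GDL_SORT_THRESHOLDS) when available...
//     return t;
//   }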

BaseGDL* gdl_sort_fun(EnvT* e)
{
  e->NParam(1);

  BaseGDL* p0 = e->GetParDefined(0);

  if (p0->Type() == GDL_STRUCT)
    e->Throw("Struct expression not allowed in this context: " +
      e->GetParString(0));

  static int l64Ix = e->KeywordIx("L64");
  bool l64 = e->KeywordSet(l64Ix); // note: l64 is currently unused below; indices returned are always DLong

  static int qsortIx = e->KeywordIx("QUICK");
  bool quick = e->KeywordSet(qsortIx);
  static int mergeIx = e->KeywordIx("MERGE");
  bool merge = e->KeywordSet(mergeIx);
  static int radixIx = e->KeywordIx("RADIX");
  bool radix = e->KeywordSet(radixIx);
  static int insertIx = e->KeywordIx("INSERT");
  bool insert = e->KeywordSet(insertIx);
  static int autoIx = e->KeywordIx("AUTO");
  bool doauto = e->KeywordSet(autoIx);
  if (!(radix || quick || merge || insert || doauto)) e->Throw("I need one of the QUICK, MERGE, RADIX, INSERT or AUTO keywords set.");

  SizeT nEl = p0->N_Elements();
  if (insert && nEl > 50000) e->Throw("INSERT would take forever on "+i2s(nEl)+" elements");
  //Radix sort is special in that it is not obvious ---due to shortcuts--- which of the 2 internal storage
  //arrays is returned as the sorted index.
  if (radix) {
    DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::NOALLOC);
    DLong* index = NULL;
    if (p0->Type() == GDL_DOUBLE) {
      DDouble* val = (DDouble*) (static_cast<DDoubleGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_FLOAT) {
      DFloat* val = (DFloat*) (static_cast<DFloatGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_LONG) {
      DLong* val = (DLong*) (static_cast<DLongGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_ULONG) {
      DULong* val = (DULong*) (static_cast<DULongGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_LONG64) {
      DLong64* val = (DLong64*) (static_cast<DLong64GDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_ULONG64) {
      DULong64* val = (DULong64*) (static_cast<DULong64GDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_INT) {
      DInt* val = (DInt*) (static_cast<DIntGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_UINT) {
      DUInt* val = (DUInt*) (static_cast<DUIntGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_BYTE) {
      DByte* val = (DByte*) (static_cast<DByteGDL*> (p0)->DataAddr());
      index = (DLong*) RadixSort<DULong>(val, nEl);
    } else if (p0->Type() == GDL_COMPLEX) {
      DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
      DComplex *ff = (DComplex*) p0F->DataAddr();
      // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
      DFloat* magnitude = new DFloat[nEl];
      for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
      index = (DLong*) RadixSort<DULong>(magnitude, nEl);
      delete[] magnitude;
    } else if (p0->Type() == GDL_COMPLEXDBL) {
      DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
      DComplexDbl *ff = (DComplexDbl*) p0F->DataAddr();
      // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
      DDouble* magnitude = new DDouble[nEl];
      for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
      index = (DLong*) RadixSort<DULong>(magnitude, nEl);
      delete[] magnitude;
    } else e->Throw("FIXME: RADIX not supported for this type."); // 'index' would otherwise be used uninitialized
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else {
    if (p0->Type() == GDL_DOUBLE) {
      DDouble* val = (DDouble*) (static_cast<DDoubleGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      // NaNs are not well handled by the other sorts...
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndexWithNaN(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_FLOAT) {
      DFloat* val = (DFloat*) (static_cast<DFloatGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      // NaNs are not well handled by the other sorts...
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndexWithNaN(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_LONG) {
      DLong* val = (DLong*) (static_cast<DLongGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_ULONG) {
      DULong* val = (DULong*) (static_cast<DULongGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_LONG64) {
      DLong64* val = (DLong64*) (static_cast<DLong64GDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_ULONG64) {
      DULong64* val = (DULong64*) (static_cast<DULong64GDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_INT) {
      DInt* val = (DInt*) (static_cast<DIntGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_UINT) {
      DUInt* val = (DUInt*) (static_cast<DUIntGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_BYTE) {
      DByte* val = (DByte*) (static_cast<DByteGDL*> (p0)->DataAddr());
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(val, hh, low, high);
      } else if (quick) {
        QuickSortIndex(val, hh, low, high);
      } else if (insert) {
        insertionSortIndex(val, hh, low, high);
      } else {
        AdaptiveSortIndex(val, hh, low, high);
      }
      return res;
    } else if (p0->Type() == GDL_COMPLEX) {
      DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
      DComplex *ff = (DComplex*) p0F->DataAddr();
      // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
      DFloat* magnitude = new DFloat[nEl];
      for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(magnitude, hh, low, high);
      } else if (quick) {
        QuickSortIndex(magnitude, hh, low, high);
      } else if (insert) {
        insertionSortIndex(magnitude, hh, low, high);
      } else {
        AdaptiveSortIndexWithNaN(magnitude, hh, low, high);
      }
      delete[] magnitude;
      return res;
    } else if (p0->Type() == GDL_COMPLEXDBL) {
      DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
      DComplexDbl *ff = (DComplexDbl*) p0F->DataAddr();
      // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
      DDouble* magnitude = new DDouble[nEl];
      for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
      DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
      DLong *hh = static_cast<DLong*> (res->DataAddr());
      SizeT low = 0;
      SizeT high = nEl - 1;
      if (merge) {
        MergeSortIndex(magnitude, hh, low, high);
      } else if (quick) {
        QuickSortIndex(magnitude, hh, low, high);
      } else if (insert) {
        insertionSortIndex(magnitude, hh, low, high);
      } else {
        AdaptiveSortIndexWithNaN(magnitude, hh, low, high);
      }
      delete[] magnitude;
      return res;
    } else e->Throw("FIXME.");
  }
  return NULL;
}

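// Example GDL-level use of the private function above (illustrative session, assuming
// GDL_SORT is registered with the QUICK/MERGE/RADIX/INSERT/AUTO keywords parsed here):
//   GDL> a = randomu(seed, 1000000)
//   GDL> i = gdl_sort(a, /MERGE)          ; index vector from the merge sort alone
//   GDL> print, array_equal(i, sort(a))   ; should agree with the default SORT(), up to ties
// (testsuite/benchmark/compare_sort_algos.pro loops over these keywords to time each algorithm.)
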
template<typename GDLIndexT, typename IndexT>
inline BaseGDL* do_sort_fun(BaseGDL* p0)
{
  SizeT nEl = p0->N_Elements();
  if (p0->Type() == GDL_BYTE) { //avoiding the INDGEN initialisation of 'res' (NOALLOC) makes radix sort better than anything else for bytes.
    DByte* val = (DByte*)(static_cast<DByteGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_INT) { //still true for INTs
    DInt* val = (DInt*)(static_cast<DIntGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_UINT) { //still true for UINTs
    DUInt* val = (DUInt*)(static_cast<DUIntGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_FLOAT) { //adaptive: radix excluded because of its different -NaN ordering
    DFloat* val = (DFloat*)(static_cast<DFloatGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndexWithNaN<DFloat, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_DOUBLE) { //adaptive: radix excluded because of its different -NaN ordering
    DDouble* val = (DDouble*)(static_cast<DDoubleGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndexWithNaN<DDouble, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_COMPLEX) { //adaptive: radix excluded because of its different -NaN ordering
    DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
    DComplex *ff = (DComplex*)p0F->DataAddr();
    // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
    DFloat* magnitude = new DFloat[nEl];
    for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndexWithNaN<DFloat, IndexT>(magnitude, hh, low, high);
    delete[] magnitude;
    return res;
  } else if (p0->Type() == GDL_COMPLEXDBL) { //adaptive: radix excluded because of its different -NaN ordering
    DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
    DComplexDbl *ff = (DComplexDbl*)p0F->DataAddr();
    // temp keys: std::norm() gives the squared magnitude |z|^2, which ranks identically to |z|
    DDouble* magnitude = new DDouble[nEl];
    for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndexWithNaN<DDouble, IndexT>(magnitude, hh, low, high);
    delete[] magnitude;
    return res;
  } else if (p0->Type() == GDL_LONG && nEl > RADIX_SORT_THRESHOLD_FOR_LONG) {
    DLong* val = (DLong*)(static_cast<DLongGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DLong, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_LONG && nEl <= RADIX_SORT_THRESHOLD_FOR_LONG) {
    DLong* val = (DLong*)(static_cast<DLongGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_ULONG && nEl > RADIX_SORT_THRESHOLD_FOR_ULONG) {
    DULong* val = (DULong*)(static_cast<DULongGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DULong, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_ULONG && nEl <= RADIX_SORT_THRESHOLD_FOR_ULONG) {
    DULong* val = (DULong*)(static_cast<DULongGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_LONG64 && nEl > RADIX_SORT_THRESHOLD_FOR_LONG64) {
    DLong64* val = (DLong64*)(static_cast<DLong64GDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DLong64, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_LONG64 && nEl <= RADIX_SORT_THRESHOLD_FOR_LONG64) {
    DLong64* val = (DLong64*)(static_cast<DLong64GDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_ULONG64 && nEl > RADIX_SORT_THRESHOLD_FOR_ULONG64) {
    DULong64* val = (DULong64*)(static_cast<DULong64GDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DULong64, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_ULONG64 && nEl <= RADIX_SORT_THRESHOLD_FOR_ULONG64) {
    DULong64* val = (DULong64*)(static_cast<DULong64GDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
    IndexT *index;
    index = RadixSort<IndexT>(val, nEl);
    res->SetBuffer(index);
    res->SetBufferSize(nEl);
    res->SetDim(dimension(nEl));
    return res;
  } else if (p0->Type() == GDL_STRING) {
    DString* val = (DString*)(static_cast<DStringGDL*>(p0)->DataAddr());
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    QuickSortIndex<DString, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_PTR) {
    // actually this sorts the heap indexes themselves.
    DPtr* val = (DPtr*)(static_cast<DPtrGDL*>(p0)->DataAddr()); //heap indexes
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DPtr, IndexT>(val, hh, low, high);
    return res;
  } else if (p0->Type() == GDL_OBJ) {
    // same: sorts the object heap indexes.
    DObj* val = (DObj*)(static_cast<DObjGDL*>(p0)->DataAddr()); //heap indexes
    GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
    IndexT *hh = static_cast<IndexT*> (res->DataAddr());
    SizeT low = 0;
    SizeT high = nEl - 1;
    AdaptiveSortIndex<DObj, IndexT>(val, hh, low, high);
    return res;
  }
  return NULL;
}
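
// SORT() front-end below: the only choice left to the user is the width of the returned
// index vector, via /L64. Example (illustrative):
//   GDL> i   = sort(a)        ; LONG indices
//   GDL> i64 = sort(a, /L64)  ; LONG64 indices, for arrays beyond the 32-bit index range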
BaseGDL* sort_fun(EnvT* e)
{
  BaseGDL* p0 = e->GetParDefined(0);
  if (p0->Type() == GDL_STRUCT) e->Throw("Struct expression not allowed in this context: " + e->GetParString(0));
  static int l64Ix = e->KeywordIx("L64");
  bool l64 = e->KeywordSet(l64Ix);
  if (!l64) return do_sort_fun<DLongGDL,DLong>(p0);
  else return do_sort_fun<DLong64GDL,DLong64>(p0);
}
}
