1 /***************************************************************************
2                           sorting.cpp  -  all things sortlike
3                              -------------------
    begin                : November 2018
5     copyright            : (C) 2018 by G. Duvert and Others
6     email                : gilles dot duvert at free dot fr
7  ***************************************************************************/
8 
9 /***************************************************************************
10  *                                                                         *
11  *   This program is free software; you can redistribute it and/or modify  *
12  *   it under the terms of the GNU General Public License as published by  *
13  *   the Free Software Foundation; either version 2 of the License, or     *
14  *   (at your option) any later version.                                   *
15  *                                                                         *
16  ***************************************************************************/
17 
18 #include "includefirst.hpp"
19 
20 #include "datatypes.hpp"
21 #include "envt.hpp"
22 #include "dinterpreter.hpp"
23 
24 namespace lib {
#define INSERTION_SORT_THRESHOLD 256  // above this size, merge sort is better (floats)
#define QUICK_SORT_THRESHOLD 0 // never better
#define RADIX_SORT_THRESHOLD_ADAPT 2000000 // adaptive-mode threshold, non-float types
#define RADIX_SORT_THRESHOLD_ADAPT_FLOAT 600000 // NOT USED: adaptive-mode threshold for floats (radix is not used on floats in adaptive mode because of the -NaN handling)
#define RADIX_SORT_THRESHOLD_FOR_FLOAT 30000000 // NOT USED: radix vs. IDL threshold (radix is not used on floats because of the -NaN handling)
#define RADIX_SORT_THRESHOLD_FOR_DOUBLE 6000000 // idem, for doubles
31 //radix is better than anything from 0 to this value for all integer types
32 #define RADIX_SORT_THRESHOLD_FOR_LONG 3000000
33 #define RADIX_SORT_THRESHOLD_FOR_ULONG 3000000
34 #define RADIX_SORT_THRESHOLD_FOR_LONG64 2000000
35 #define RADIX_SORT_THRESHOLD_FOR_ULONG64 1000000
36 #define MERGESORT_PARALLEL_THRESHOLD 1000000
37 
// The following GDL version of SORT() is a complete rewrite using modern methods for a faster sort.
// It provides a noticeable, sometimes even huge, speed gain, depending on array sizes and types.
// This version uses
// 1) modified code from https://github.com/Pierre-Terdiman/RadixRedux
// (adapted to GDL interfaces and to types other than floats and longs), plus
// 2) new versions of other sorting methods, since the various codes found on the web sort values,
// not indexes, and do not handle NaNs at all.
// I've added a "private" GDL_SORT() function that allows one of the /RADIX, /QUICK, /MERGE and /INSERT
// sorting methods to be chosen explicitly. It is used by the procedure testsuite/benchmark/compare_sort_algos
// to plot the performance of the different sorts (insertion sort is excluded as it is O(N^2) and would take forever)
// and to tune the combination of these methods that gives the best performance in the final SORT().
// With a modicum of work, this could be used to fine-tune the SORT() method for a particular machine or purpose,
// maintaining top performance in specific cases (clusters, etc.).
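// As an illustration only (a sketch, not a specification of the exact keyword syntax), the benchmark
// procedure can request a specific algorithm along the lines of
//   index = GDL_SORT(array, /RADIX)    ; or /QUICK, /MERGE, /INSERT
// whereas the public SORT() picks the method automatically from the thresholds defined above.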
51 //------------------Radix Sorting Codes ------------------------------------------------------------------------------
52 // Here is the copyright for the Radix code by PT. It is understood that Pierre's code was for Floats, Longs and ULongs only.
53 //Copyright (c) 2018 Pierre Terdiman - http://www.codercorner.com/blog
54 //
55 //
56 //This software is provided 'as-is', without any express or implied warranty.
57 //In no event will the authors be held liable for any damages arising from the use of this software.
58 //Permission is granted to anyone to use this software for any purpose, including commercial applications,
59 //and to alter it and redistribute it freely, subject to the following restrictions:
60 //1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software.
61 //If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
62 //2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
63 //3. This notice may not be removed or altered from any source distribution.
64 //------------------Radix Sorting Codes ------------------------------------------------------------------------------
65 //
66 #if defined(IS_BIGENDIAN)
67 	#define H0_OFFSET2	256
68 	#define H1_OFFSET2	0
69         #define BYTES_INC2	(1-j)
70     #define H0_OFFSET4	768
71 	#define H1_OFFSET4	512
72 	#define H2_OFFSET4	256
73 	#define H3_OFFSET4	0
74 	#define BYTES_INC4	(3-j)
75 	#define H7_OFFSET8	0
76 	#define H6_OFFSET8	256
77 	#define H5_OFFSET8	512
78 	#define H4_OFFSET8	768
79 	#define H3_OFFSET8	1024
80 	#define H2_OFFSET8	1280
81 	#define H1_OFFSET8	1536
82 	#define H0_OFFSET8	1792
83 	#define BYTES_INC8	(7-j)
84 #else
85 	#define H0_OFFSET2	0
86 	#define H1_OFFSET2	256
87 	#define BYTES_INC2	j
88 	#define H0_OFFSET4	0
89 	#define H1_OFFSET4	256
90 	#define H2_OFFSET4	512
91 	#define H3_OFFSET4	768
92 	#define BYTES_INC4	j
93 	#define H0_OFFSET8	0
94 	#define H1_OFFSET8	256
95 	#define H2_OFFSET8	512
96 	#define H3_OFFSET8	768
97 	#define H4_OFFSET8	1024
98 	#define H5_OFFSET8	1280
99 	#define H6_OFFSET8	1536
100 	#define H7_OFFSET8	1792
101 	#define BYTES_INC8	j
102 #endif
103 
104 #include <string.h> //for memset (Windows)
105 
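// The CREATE_HISTOGRAMSxxx macros below fill Histogram[] with the per-byte counts of every radix pass
// in a single read of the input buffer, and return early from the enclosing RadixSort() function
// (leaving mRanks as the identity permutation, or unchanged when previous ranks were supplied)
// if the input turns out to be already sorted.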
106 #define CREATE_HISTOGRAMS2(type, buffer)													\
107 	/* Clear counters/histograms */															\
108 	memset(Histogram, 0, 256*2*sizeof(T));                                                 \
109 																							\
110 	/* Prepare to count */																	\
111 	const DByte* p = (const DByte*)input;													\
112 	const DByte* pe = &p[nb*2];																\
113 	T* h0 = &Histogram[H0_OFFSET2];	/* Histogram for first pass (LSB)	*/                  \
114 	T* h1 = &Histogram[H1_OFFSET2];	/* Histogram for last pass (MSB)	*/                  \
115 																							\
116 	bool AlreadySorted = true;	/* Optimism... */											\
117 																							\
118 	if(ranksUnInitialized)																    \
119 	{																						\
120 		/* Prepare for temporal coherence */												\
121 		const type* Running = (type*)buffer;												\
122 		type PrevVal = *Running;															\
123 																							\
124 		while(p!=pe)																		\
125 		{																					\
126 			/* Read input buffer in previous sorted order */								\
127 			const type Val = *Running++;													\
128 			/* Check whether already sorted or not */										\
129 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
130 			/* Update for next iteration */													\
131 			PrevVal = Val;																	\
132 																							\
133 			/* Create histograms */															\
134 			h0[*p++]++;	h1[*p++]++;                         								\
135 		}																					\
136 																							\
137 		/* If all input values are already sorted, we just have to return and leave the */	\
138 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
139 		/* coherence, for example when used to sort transparent faces.					*/	\
140 		if(AlreadySorted)																	\
141 		{																					\
142 			for(SizeT i=0;i<nb;++i)	mRanks[i] = i;										    \
143 			return mRanks;																	\
144 		}																					\
145 	}																						\
146 	else																					\
147 	{																						\
148 		/* Prepare for temporal coherence */												\
149 		const T* Indices = mRanks;                                                          \
150 		type PrevVal = (type)buffer[*Indices];												\
151 																							\
152 		while(p!=pe)																		\
153 		{																					\
154 			/* Read input buffer in previous sorted order */								\
155 			const type Val = (type)buffer[*Indices++];										\
156 			/* Check whether already sorted or not */										\
157 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
158 			/* Update for next iteration */													\
159 			PrevVal = Val;																	\
160 																							\
161 			/* Create histograms */															\
162 			h0[*p++]++;	h1[*p++]++;                        									\
163 		}																					\
164 																							\
165 		/* If all input values are already sorted, we just have to return and leave the */	\
166 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
167 		/* coherence, for example when used to sort transparent faces.					*/	\
168 		if(AlreadySorted) return mRanks;                       							    \
169 	}																						\
170 																							\
171 	/* Else there has been an early out and we must finish computing the histograms */		\
172 	while(p!=pe)																			\
173 	{																						\
174 		/* Create histograms without the previous overhead */								\
175 		h0[*p++]++;	h1[*p++]++;	                       										\
176     }
177 
178 #define CREATE_HISTOGRAMS4(type, buffer)													\
179 	/* Clear counters/histograms */															\
180 	memset(Histogram, 0, 256*4*sizeof(T));                                             	\
181 																							\
182 	/* Prepare to count */																	\
183 	const DByte* p = (const DByte*)input;													\
184 	const DByte* pe = &p[nb*4];																\
185 	T* h0 = &Histogram[H0_OFFSET4];	/* Histogram for first pass (LSB)	*/                  \
186 	T* h1 = &Histogram[H1_OFFSET4];	/* Histogram for second pass		*/              	\
187 	T* h2 = &Histogram[H2_OFFSET4];	/* Histogram for third pass			*/              	\
188 	T* h3 = &Histogram[H3_OFFSET4];	/* Histogram for last pass (MSB)	*/                	\
189 																							\
190 	bool AlreadySorted = true;	/* Optimism... */											\
191 																							\
192 	if(ranksUnInitialized)																    \
193 	{																						\
194 		/* Prepare for temporal coherence */												\
195 		const type* Running = (type*)buffer;												\
196 		type PrevVal = *Running;															\
197 																							\
198 		while(p!=pe)																		\
199 		{																					\
200 			/* Read input buffer in previous sorted order */								\
201 			const type Val = *Running++;													\
202 			/* Check whether already sorted or not */										\
203       if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
204 			/* Update for next iteration */													\
205 			PrevVal = Val;																	\
206 																							\
207 			/* Create histograms */															\
208 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
209 		}																					\
210 																							\
211 		/* If all input values are already sorted, we just have to return and leave the */	\
212 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
213 		/* coherence, for example when used to sort transparent faces.					*/	\
214 		if(AlreadySorted)																	\
215 		{																					\
216 			for(SizeT i=0;i<nb;++i)	mRanks[i] = i;										    \
217 			return mRanks;															        \
218 		}																					\
219 	}																						\
220 	else																					\
221 	{																						\
222 		/* Prepare for temporal coherence */												\
223 		const T* Indices = mRanks;                                                        	\
224 		type PrevVal = (type)buffer[*Indices];												\
225 																							\
226 		while(p!=pe)																		\
227 		{																					\
228 			/* Read input buffer in previous sorted order */								\
229 			const type Val = (type)buffer[*Indices++];										\
230 			/* Check whether already sorted or not */										\
231 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
232 			/* Update for next iteration */													\
233 			PrevVal = Val;																	\
234 																							\
235 			/* Create histograms */															\
236 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
237 		}																					\
238 																							\
239 		/* If all input values are already sorted, we just have to return and leave the */	\
240 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
241 		/* coherence, for example when used to sort transparent faces.					*/	\
242 		if(AlreadySorted) return mRanks;                   									\
243 	}																						\
244 																							\
245 	/* Else there has been an early out and we must finish computing the histograms */		\
246 	while(p!=pe)																			\
247 	{																						\
248 		/* Create histograms without the previous overhead */								\
249 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;										\
250     }
251 
252 #define CREATE_HISTOGRAMS8(type, buffer)													\
253 	/* Clear counters/histograms */															\
254 	memset(Histogram, 0, 256*8*sizeof(T));                                                  \
255 																							\
256 	/* Prepare to count */																	\
257 	const DByte* p = (const DByte*)input;													\
258 	const DByte* pe = &p[nb*8];																\
259 	T* h0 = &Histogram[H0_OFFSET8];	/* Histogram for first pass (LSB)	*/                  \
260 	T* h1 = &Histogram[H1_OFFSET8];	/*                                  */                  \
261 	T* h2 = &Histogram[H2_OFFSET8];	/*                                  */                  \
262           T* h3 = &Histogram[H3_OFFSET8];	/*                                  */			\
263           T* h4 = &Histogram[H4_OFFSET8];	/*                                  */			\
264           T* h5 = &Histogram[H5_OFFSET8];	/*                                  */			\
265           T* h6 = &Histogram[H6_OFFSET8];	/*                                  */			\
266           T* h7 = &Histogram[H7_OFFSET8];	/* Histogram for last pass (MSB)	*/			\
267 																							\
268 	bool AlreadySorted = true;	/* Optimism... */											\
269 																							\
270 	if(ranksUnInitialized)																    \
271 	{																						\
272 		/* Prepare for temporal coherence */												\
273 		const type* Running = (type*)buffer;												\
274 		type PrevVal = *Running;															\
275 																							\
276 		while(p!=pe)																		\
277 		{																					\
278 			/* Read input buffer in previous sorted order */								\
279 			const type Val = *Running++;													\
280 			/* Check whether already sorted or not */										\
281       if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
282 			/* Update for next iteration */													\
283 			PrevVal = Val;																	\
284 																							\
285 			/* Create histograms */															\
286 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
287 			h4[*p++]++;	h5[*p++]++;	h6[*p++]++;	h7[*p++]++;									\
288 		}																					\
289 																							\
290 		/* If all input values are already sorted, we just have to return and leave the */	\
291 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
292 		/* coherence, for example when used to sort transparent faces.					*/	\
293 		if(AlreadySorted)																	\
294 		{																					\
295 			for(SizeT i=0;i<nb;++i)	mRanks[i] = i;										    \
296 			 return mRanks;															        \
297 		}																					\
298 	}																						\
299 	else																					\
300 	{																						\
301 		/* Prepare for temporal coherence */												\
302 		const T* Indices = mRanks;                                                          \
303 		type PrevVal = (type)buffer[*Indices];												\
304 																							\
305 		while(p!=pe)																		\
306 		{																					\
307 			/* Read input buffer in previous sorted order */								\
308 			const type Val = (type)buffer[*Indices++];										\
309 			/* Check whether already sorted or not */										\
310 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
311 			/* Update for next iteration */													\
312 			PrevVal = Val;																	\
313 																							\
314 			/* Create histograms */															\
315 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
316 			h4[*p++]++;	h5[*p++]++;	h6[*p++]++;	h7[*p++]++;									\
317 		}																					\
318 																							\
319 		/* If all input values are already sorted, we just have to return and leave the */	\
320 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
321 		/* coherence, for example when used to sort transparent faces.					*/	\
322 		if(AlreadySorted)  return mRanks;                  									\
323 	}																						\
324 																							\
325 	/* Else there has been an early out and we must finish computing the histograms */  \
326   while(p!=pe)                   \
327   {                      \
328     /* Create histograms without the previous overhead */        \
329       h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;         \
330       h4[*p++]++; h5[*p++]++; h6[*p++]++; h7[*p++]++;         \
331     }
  // The 'already sorted (sub)array' optimization cannot hold for floats and doubles,
  // as NaNs do NOT compare to anything, so the 'already sorted' verdict could be wrong.
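  // Example: for the float sequence {1.0f, NaN, 0.5f} every 'Val < PrevVal' test involving the NaN is
  // false, so a naive scan would report the data as already sorted even though they are not; hence the
  // extra std::isnan() tests in the float/double histogram macros below.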
334 
335 #define CREATE_HISTOGRAMSFLOAT(type, buffer)													\
336 	/* Clear counters/histograms */															\
337 	memset(Histogram, 0, 256*4*sizeof(T));                                             	\
338 																							\
339 	/* Prepare to count */																	\
340 	const DByte* p = (const DByte*)input;													\
341 	const DByte* pe = &p[nb*4];																\
342 	T* h0 = &Histogram[H0_OFFSET4];	/* Histogram for first pass (LSB)	*/                  \
343 	T* h1 = &Histogram[H1_OFFSET4];	/* Histogram for second pass		*/              	\
344 	T* h2 = &Histogram[H2_OFFSET4];	/* Histogram for third pass			*/              	\
345 	T* h3 = &Histogram[H3_OFFSET4];	/* Histogram for last pass (MSB)	*/                	\
346 																							\
347 	bool AlreadySorted = true;	/* Optimism... */											\
348 																							\
349 	if(ranksUnInitialized)																    \
350 	{																						\
351 		/* Prepare for temporal coherence */												\
352 		const type* Running = (type*)buffer;												\
353 		type PrevVal = *Running;															\
354 		if (std::isnan(PrevVal)) {AlreadySorted=false;} else {						\
355 		while(p!=pe)																		\
356 		{																					\
357 			/* Read input buffer in previous sorted order */								\
358 			const type Val = *Running++;													\
359 			/* Check whether already sorted or not, care of Nans if necessary*/										\
360       if(Val<PrevVal||std::isnan(Val))	{ AlreadySorted = false; break; } /* Early out */				\
361 			/* Update for next iteration */													\
362 			PrevVal = Val;																	\
363 																							\
364 			/* Create histograms */															\
365 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
366 		}																					\
367 		}																					\
368 		/* If all input values are already sorted, we just have to return and leave the */	\
369 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
370 		/* coherence, for example when used to sort transparent faces.					*/	\
371 		if(AlreadySorted)																	\
372 		{																					\
373 			for(SizeT i=0;i<nb;++i)	mRanks[i] = i;										    \
374 			return mRanks;															        \
375 		}																					\
376 	}																						\
377 	else																					\
378 	{																						\
379 		/* Prepare for temporal coherence */												\
380 		const T* Indices = mRanks;                                                        	\
381 		type PrevVal = (type)buffer[*Indices];												\
382 																							\
383 		while(p!=pe)																		\
384 		{																					\
385 			/* Read input buffer in previous sorted order */								\
386 			const type Val = (type)buffer[*Indices++];										\
387 			/* Check whether already sorted or not */										\
388 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
389 			/* Update for next iteration */													\
390 			PrevVal = Val;																	\
391 																							\
392 			/* Create histograms */															\
393 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
394 		}																					\
395 																							\
396 		/* If all input values are already sorted, we just have to return and leave the */	\
397 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
398 		/* coherence, for example when used to sort transparent faces.					*/	\
399 		if(AlreadySorted) return mRanks;                   									\
400 	}																						\
401 																							\
402 	/* Else there has been an early out and we must finish computing the histograms */		\
403 	while(p!=pe)																			\
404 	{																						\
405 		/* Create histograms without the previous overhead */								\
406 		h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;										\
407     }
408 
409 #define CREATE_HISTOGRAMSDOUBLE(type, buffer)													\
410 	/* Clear counters/histograms */															\
411 	memset(Histogram, 0, 256*8*sizeof(T));                                                  \
412 																							\
413 	/* Prepare to count */																	\
414 	const DByte* p = (const DByte*)input;													\
415 	const DByte* pe = &p[nb*8];																\
416 	T* h0 = &Histogram[H0_OFFSET8];	/* Histogram for first pass (LSB)	*/                  \
417 	T* h1 = &Histogram[H1_OFFSET8];	/*                                  */                  \
418 	T* h2 = &Histogram[H2_OFFSET8];	/*                                  */                  \
419           T* h3 = &Histogram[H3_OFFSET8];	/*                                  */			\
420           T* h4 = &Histogram[H4_OFFSET8];	/*                                  */			\
421           T* h5 = &Histogram[H5_OFFSET8];	/*                                  */			\
422           T* h6 = &Histogram[H6_OFFSET8];	/*                                  */			\
423           T* h7 = &Histogram[H7_OFFSET8];	/* Histogram for last pass (MSB)	*/			\
424 																							\
425 	bool AlreadySorted = true;	/* Optimism... */											\
426 																							\
427 	if(ranksUnInitialized)																    \
428 	{																						\
429 		/* Prepare for temporal coherence */												\
430 		const type* Running = (type*)buffer;												\
431 		type PrevVal = *Running;															\
432 		if (std::isnan(PrevVal)) {AlreadySorted=false;} else {						\
433 		while(p!=pe)																		\
434 		{																					\
435 			/* Read input buffer in previous sorted order */								\
436 			const type Val = *Running++;													\
437 			/* Check whether already sorted or not, care of Nans if necessary */										\
438 			if (Val<PrevVal||std::isnan(Val))	{ AlreadySorted = false; break; } /* Early out */\
439 			/* Update for next iteration */													\
440 			PrevVal = Val;																	\
441 																							\
442 			/* Create histograms */															\
443 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
444 			h4[*p++]++;	h5[*p++]++;	h6[*p++]++;	h7[*p++]++;									\
445 		}																					\
446 		}																					\
447 		/* If all input values are already sorted, we just have to return and leave the */	\
448 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
449 		/* coherence, for example when used to sort transparent faces.					*/	\
450 		if(AlreadySorted)																	\
451 		{																					\
452 			for(SizeT i=0;i<nb;++i)	mRanks[i] = i;										    \
453 			 return mRanks;															        \
454 		}																					\
455 	}																						\
456 	else																					\
457 	{																						\
458 		/* Prepare for temporal coherence */												\
459 		const T* Indices = mRanks;                                                          \
460 		type PrevVal = (type)buffer[*Indices];												\
461 																							\
462 		while(p!=pe)																		\
463 		{																					\
464 			/* Read input buffer in previous sorted order */								\
465 			const type Val = (type)buffer[*Indices++];										\
466 			/* Check whether already sorted or not */										\
467 			if(Val<PrevVal)	{ AlreadySorted = false; break; } /* Early out */				\
468 			/* Update for next iteration */													\
469 			PrevVal = Val;																	\
470 																							\
471 			/* Create histograms */															\
472 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
473 			h4[*p++]++;	h5[*p++]++;	h6[*p++]++;	h7[*p++]++;									\
474 		}																					\
475 																							\
476 		/* If all input values are already sorted, we just have to return and leave the */	\
477 		/* previous list unchanged. That way the routine may take advantage of temporal */	\
478 		/* coherence, for example when used to sort transparent faces.					*/	\
479 		if(AlreadySorted)  return mRanks;                  									\
480 	}																						\
481 																							\
482 	/* Else there has been an early out and we must finish computing the histograms */		\
483 	while(p!=pe)																			\
484 	{																						\
485 		/* Create histograms without the previous overhead */								\
486 			h0[*p++]++;	h1[*p++]++;	h2[*p++]++;	h3[*p++]++;									\
487 			h4[*p++]++;	h5[*p++]++;	h6[*p++]++;	h7[*p++]++;									\
488     }
489 
490 #define CHECK_PASS_VALIDITY(pass)															\
491 	/* Shortcut to current counters */														\
492 	const T* CurCount = &Histogram[pass<<8];                                                \
493 																							\
494 	/* Reset flag. The sorting pass is supposed to be performed. (default) */				\
495 	bool PerformPass = true;																\
496 																							\
497 	/* Check pass validity */																\
498 																							\
499 	/* If all values have the same byte, sorting is useless. */								\
500 	/* It may happen when sorting bytes or words instead of dwords. */						\
501 	/* This routine actually sorts words faster than dwords, and bytes */					\
	/* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */				\
503 	/* for words and O(n) for bytes. Running time for floats depends on actual values... */	\
504 																							\
505 	/* Get first byte */																	\
506 	const DByte UniqueVal = *(((DByte*)input)+pass);										\
507 																							\
508 	/* Check that byte's counter */               \
509     if(CurCount[UniqueVal]==nb) PerformPass=false;
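// Note: CHECK_PASS_VALIDITY leaves three names in scope for the code that follows it: CurCount (the
// 256-entry histogram of the current pass), PerformPass (false when every value shares the same byte in
// this pass, so the pass can be skipped) and UniqueVal (the byte of the first input value for this pass,
// i.e. that shared byte whenever PerformPass is false).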
510 
511   template<typename T>
static T* RadixSort(const double* inputDouble, SizeT nb)
513 {
514 	const DULong64* input = (const DULong64*)inputDouble;
515     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
516     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
517 	// Allocate histograms & offsets on the stack
518 	T Histogram[256*8];
519 	T* Link[256];
520     bool ranksUnInitialized=true;
521 
522 	{ CREATE_HISTOGRAMSDOUBLE(double, inputDouble); }
523 
524 	// Radix sort, j is the pass number (0=LSB, 7=MSB)
525 	for(int j=0;j<8;++j)
526 	{
527 		// Should we care about negative values?
528 		if(j!=7)
529 		{
530 			// Here we deal with positive values only
531 			CHECK_PASS_VALIDITY(j);
532 
533 			if(PerformPass)
534 			{
535 				// Create offsets
536 				Link[0] = mRanks2;
537 				for(SizeT i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
538 
539 				// Perform Radix Sort
540 				const DByte* InputBytes = (const DByte*)input;
541 				InputBytes += BYTES_INC8;
542 				if(ranksUnInitialized)
543 				{
544 					for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<3]]++ = i;
545 					ranksUnInitialized=false;
546 				}
547 				else
548 				{
549 					const T* Indices		= mRanks;
550 					const T* IndicesEnd	= &mRanks[nb];
551 					while(Indices!=IndicesEnd)
552 					{
553 						const T id = *Indices++;
554 						*Link[InputBytes[id<<3]]++ = id;
555 					}
556 				}
557 
558 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
559 				T* Tmp = mRanks;
560 				mRanks = mRanks2;
561 				mRanks2 = Tmp;
562 			}
563 		}
564 		else
565 		{
566 			// This is a special case to correctly handle negative values
567 			CHECK_PASS_VALIDITY(j);
568 
569 			if(PerformPass)
570 			{
571 				// Compute #negative values involved if needed
572 				T NbNegativeValues = 0;
				// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
574 				// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
575 				// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
576 				// ### is that ok on Apple ?!
577 				const T* h7 = &Histogram[H7_OFFSET8];
578 				for(int i=128;i<256;++i)	NbNegativeValues += h7[i];	// 768 for last histogram, 128 for negative part
579 
580 				// Create biased offsets, in order for negative numbers to be sorted as well
581 				Link[0] = &mRanks2[NbNegativeValues];										// First positive number takes place after the negative ones
582 				for(int i=1;i<128;++i)		Link[i] = Link[i-1] + CurCount[i-1];		// 1 to 128 for positive numbers
583 
584 				// We must reverse the sorting order for negative numbers!
585 				Link[255] = mRanks2;
586 				for(int i=0;i<127;++i)	Link[254-i] = Link[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
587 				for(int i=128;i<256;++i)	Link[i] += CurCount[i];							// Fixing the wrong place for negative values
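				// For IEEE-754 negatives the raw bit pattern grows as the value decreases, e.g.
				// bits(-1.0) = 0xBFF0000000000000 < bits(-2.0) = 0xC000000000000000 although -1.0 > -2.0,
				// which is why the links for bins 128..255 are filled backwards here.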
588 
589 				// Perform Radix Sort
590 				if(ranksUnInitialized)
591 				{
592 					for(SizeT i=0;i<nb;++i)
593 					{
594 						const T Radix = input[i]>>56;			// Radix byte, same as above. AND is useless here (udword).
595 						// ### cmp to be killed. Not good. Later.
596 						if(Radix<128)		*Link[Radix]++ = i;		// Number is positive, same as above
597 						else				*(--Link[Radix]) = i;	// Number is negative, flip the sorting order
598 					}
599 					ranksUnInitialized=false;
600 				}
601 				else
602 				{
603 					for(SizeT i=0;i<nb;++i)
604 					{
605 						const T Radix = input[mRanks[i]]>>56;			// Radix byte, same as above. AND is useless here (udword).
606 						// ### cmp to be killed. Not good. Later.
607 						if(Radix<128)		*Link[Radix]++ = mRanks[i];		// Number is positive, same as above
608 						else				*(--Link[Radix]) = mRanks[i];	// Number is negative, flip the sorting order
609 					}
610 				}
611 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
612 				T* Tmp = mRanks;
613 				mRanks = mRanks2;
614 				mRanks2 = Tmp;
615 			}
616 			else
617 			{
				// The pass is useless, yet we still have to reverse the order of the current list if all values are negative.
619 				if(UniqueVal>=128)
620 				{
621 					if(ranksUnInitialized)
622 					{
623 						// ###Possible?
624 						for(SizeT i=0;i<nb;++i)	mRanks2[i] = nb-i-1;
625 						ranksUnInitialized=false;
626 					}
627 					else
628 					{
629 						for(SizeT i=0;i<nb;++i)	mRanks2[i] = mRanks[nb-i-1];
630 					}
631 
632 					// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
633 					T* Tmp = mRanks;
634 					mRanks = mRanks2;
635 					mRanks2 = Tmp;
636 				}
637 			}
638 		}
639 	}
640     gdlAlignedFree(mRanks2);
641     return mRanks;
642 }
643 
644 template<typename T>
static T* RadixSort(const float* inputFloat, SizeT nb)
646 {
647 	const DULong* input = (const DULong*)inputFloat;
648     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
649     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
650 	// Allocate histograms & offsets on the stack
651 	T Histogram[256*4];
652 	T* Link[256];
653     bool ranksUnInitialized=true;
654 
655 	// Create histograms (counters). Counters for all passes are created in one run.
656 	// Pros:	read input buffer once instead of four times
657 	// Cons:	mHistogram is 4Kb instead of 1Kb
658 	// Floating-point values are always supposed to be signed values, so there's only one code path there.
659 	// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
660 	// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
661 	// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
662 	// wouldn't work with mixed positive/negative values....
663 	{ CREATE_HISTOGRAMSFLOAT(float, inputFloat); }
664 
665 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
666 	for(int j=0;j<4;++j)
667 	{
668 		// Should we care about negative values?
669 		if(j!=3)
670 		{
671 			// Here we deal with positive values only
672 			CHECK_PASS_VALIDITY(j);
673 
674 			if(PerformPass)
675 			{
676 				// Create offsets
677 				Link[0] = mRanks2;
678 				for(SizeT i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
679 
680 				// Perform Radix Sort
681 				const DByte* InputBytes = (const DByte*)input;
682 				InputBytes += BYTES_INC4;
683 				if(ranksUnInitialized)
684 				{
685 					for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<2]]++ = i;
686 					ranksUnInitialized=false;
687 				}
688 				else
689 				{
690 					const T* Indices		= mRanks;
691 					const T* IndicesEnd	= &mRanks[nb];
692 					while(Indices!=IndicesEnd)
693 					{
694 						const T id = *Indices++;
695 						*Link[InputBytes[id<<2]]++ = id;
696 					}
697 				}
698 
699 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
700 				T* Tmp = mRanks;
701 				mRanks = mRanks2;
702 				mRanks2 = Tmp;
703 			}
704 		}
705 		else
706 		{
707 			// This is a special case to correctly handle negative values
708 			CHECK_PASS_VALIDITY(j);
709 
710 			if(PerformPass)
711 			{
712 				// Compute #negative values involved if needed
713 				T NbNegativeValues = 0;
				// An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
715 				// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
716 				// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
717 				// ### is that ok on Apple ?!
718 				const T* h3 = &Histogram[H3_OFFSET4];
719 				for(int i=128;i<256;++i)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
720 
721 				// Create biased offsets, in order for negative numbers to be sorted as well
722 				Link[0] = &mRanks2[NbNegativeValues];										// First positive number takes place after the negative ones
723 				for(int i=1;i<128;++i)		Link[i] = Link[i-1] + CurCount[i-1];		// 1 to 128 for positive numbers
724 
725 				// We must reverse the sorting order for negative numbers!
726 				Link[255] = mRanks2;
727 				for(int i=0;i<127;++i)	Link[254-i] = Link[255-i] + CurCount[255-i];	// Fixing the wrong order for negative values
728 				for(int i=128;i<256;++i)	Link[i] += CurCount[i];							// Fixing the wrong place for negative values
729 
730 				// Perform Radix Sort
731 				if(ranksUnInitialized)
732 				{
733 					for(SizeT i=0;i<nb;++i)
734 					{
735 						const DULong Radix = input[i]>>24;			// Radix byte, same as above. AND is useless here (udword).
736 						// ### cmp to be killed. Not good. Later.
737 						if(Radix<128)		*Link[Radix]++ = i;		// Number is positive, same as above
738 						else				*(--Link[Radix]) = i;	// Number is negative, flip the sorting order
739 					}
740 					ranksUnInitialized=false;
741 				}
742 				else
743 				{
744 					for(SizeT i=0;i<nb;++i)
745 					{
746 						const DULong Radix = input[mRanks[i]]>>24;			// Radix byte, same as above. AND is useless here (udword).
747 						// ### cmp to be killed. Not good. Later.
748 						if(Radix<128)		*Link[Radix]++ = mRanks[i];		// Number is positive, same as above
749 						else				*(--Link[Radix]) = mRanks[i];	// Number is negative, flip the sorting order
750 					}
751 				}
752 				// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
753 				T* Tmp = mRanks;
754 				mRanks = mRanks2;
755 				mRanks2 = Tmp;
756 			}
757 			else
758 			{
				// The pass is useless, yet we still have to reverse the order of the current list if all values are negative.
760 				if(UniqueVal>=128)
761 				{
762 					if(ranksUnInitialized)
763 					{
764 						// ###Possible?
765 						for(SizeT i=0;i<nb;++i)	mRanks2[i] = nb-i-1;
766 						ranksUnInitialized=false;
767 					}
768 					else
769 					{
770 						for(SizeT i=0;i<nb;++i)	mRanks2[i] = mRanks[nb-i-1];
771 					}
772 
773 					// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
774 					T* Tmp = mRanks;
775 					mRanks = mRanks2;
776 					mRanks2 = Tmp;
777 				}
778 			}
779 		}
780 	}
781     gdlAlignedFree(mRanks2);
782     return mRanks;
783 }
784 
785 template<typename T>
static T* RadixSort(const DLong64* input, SizeT nb)
787 {
788     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
789     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
790 	// Allocate histograms & offsets on the stack
791 	T Histogram[256*8];
792 	T* Link[256];
793     bool ranksUnInitialized=true;
794 
795     CREATE_HISTOGRAMS8(DLong64, input);
796 
797 	// Radix sort, j is the pass number (0=LSB, 7=MSB)
798 	for(int j=0;j<8;++j)
799 	{
800 		CHECK_PASS_VALIDITY(j);
801 
802 		// Sometimes the last (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
803 		// not a problem, numbers are correctly sorted anyway.
804 		if(PerformPass)
805 		{
806 			// Should we care about negative values?
807 			if(j!=7)
808 			{
809 				// Here we deal with positive values only
810 
811 				// Create offsets
812 				Link[0] = mRanks2;
813 				for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
814 			}
815 			else
816 			{
817 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
818 
819 				// Compute #negative values involved if needed
820 				T NbNegativeValues = 0;
                // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
822                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
823                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
824                 const T* h7 = &Histogram[H7_OFFSET8];
825                 for(int i=128;i<256;++i)	NbNegativeValues += h7[i];	// 768 for last histogram, 128 for negative part
826 
827 				// Create biased offsets, in order for negative numbers to be sorted as well
828 				Link[0] = &mRanks2[NbNegativeValues];										// First positive number takes place after the negative ones
829 				for(int i=1;i<128;++i)		Link[i] = Link[i-1] + CurCount[i-1];		// 1 to 128 for positive numbers
830 
831 				// Fixing the wrong place for negative values
832 				Link[128] = mRanks2;
833 				for(int i=129;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
834 			}
835 
836 			// Perform Radix Sort
837 			const DByte* InputBytes	= (const DByte*)input;
838 			InputBytes += BYTES_INC8;
839 			if(ranksUnInitialized)
840 			{
841 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<3]]++ = i;
842 				ranksUnInitialized=false;
843 			}
844 			else
845 			{
846 				const T* Indices		= mRanks;
847 				const T* IndicesEnd	= &mRanks[nb];
848 				while(Indices!=IndicesEnd)
849 				{
850 					const T id = *Indices++;
851 					*Link[InputBytes[id<<3]]++ = id;
852 				}
853 			}
854 
855 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
856 			T* Tmp = mRanks;
857 			mRanks = mRanks2;
858 			mRanks2 = Tmp;
859 		}
860 	}
861     gdlAlignedFree(mRanks2);
862     return mRanks;
863 }
864 
865 template<typename T>
static T* RadixSort(const DULong64* input, SizeT nb)
867 {
868     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
869     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
870 	// Allocate histograms & offsets on the stack
871 	T Histogram[256*8];
872 	T* Link[256];
873     bool ranksUnInitialized=true;
874 
875     CREATE_HISTOGRAMS8(DULong64, input);
876 
877 	// Radix sort, j is the pass number (0=LSB, 7=MSB)
878 	for(int j=0;j<8;++j)
879 	{
880 		CHECK_PASS_VALIDITY(j);
881 
		// Sometimes a pass is skipped because all values share the same byte at that position (the type is
		// unsigned, so there is no negative special case). This is not a problem, numbers are correctly sorted anyway.
884 		if(PerformPass)
885 		{
886             // Here we deal with positive values only
887 
888             // Create offsets
889             Link[0] = mRanks2;
890             for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
891 
892 			// Perform Radix Sort
893 			const DByte* InputBytes	= (const DByte*)input;
894 			InputBytes += BYTES_INC8;
895 			if(ranksUnInitialized)
896 			{
897 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<3]]++ = i;
898 				ranksUnInitialized=false;
899 			}
900 			else
901 			{
902 				const T* Indices		= mRanks;
903 				const T* IndicesEnd	= &mRanks[nb];
904 				while(Indices!=IndicesEnd)
905 				{
906 					const T id = *Indices++;
907 					*Link[InputBytes[id<<3]]++ = id;
908 				}
909 			}
910 
911 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
912 			T* Tmp = mRanks;
913 			mRanks = mRanks2;
914 			mRanks2 = Tmp;
915 		}
916 	}
917     gdlAlignedFree(mRanks2);
918     return mRanks;
919 }
920 
921 template<typename T>
static T* RadixSort(const DLong* input, SizeT nb)
923 {
924     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
925     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
926 	// Allocate histograms & offsets on the stack
927 	T Histogram[256*4];
928 	T* Link[256];
929     bool ranksUnInitialized=true;
930 
931 	// Create histograms (counters). Counters for all passes are created in one run.
932 	// Pros:	read input buffer once instead of four times
933 	// Cons:	mHistogram is 4Kb instead of 1Kb
934 	// We must take care of signed/unsigned values for temporal coherence.... I just
935 	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
936 
937     CREATE_HISTOGRAMS4(DLong, input);
938 
939 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
940 	for(int j=0;j<4;++j)
941 	{
942 		CHECK_PASS_VALIDITY(j);
943 
944 		// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
945 		// not a problem, numbers are correctly sorted anyway.
946 		if(PerformPass)
947 		{
948 			// Should we care about negative values?
949 			if(j!=3)
950 			{
951 				// Here we deal with positive values only
952 
953 				// Create offsets
954 				Link[0] = mRanks2;
955 				for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
956 			}
957 			else
958 			{
959 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
960 
961 				// Compute #negative values involved if needed
962 				T NbNegativeValues = 0;
                // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
964                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
965                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
966                 const T* h3 = &Histogram[H3_OFFSET4];
967                 for(int i=128;i<256;++i)	NbNegativeValues += h3[i];	// 768 for last histogram, 128 for negative part
968 
969 				// Create biased offsets, in order for negative numbers to be sorted as well
970 				Link[0] = &mRanks2[NbNegativeValues];										// First positive number takes place after the negative ones
971 				for(int i=1;i<128;++i)		Link[i] = Link[i-1] + CurCount[i-1];		// 1 to 128 for positive numbers
972 
973 				// Fixing the wrong place for negative values
974 				Link[128] = mRanks2;
975 				for(int i=129;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
976 			}
977 
978 			// Perform Radix Sort
979 			const DByte* InputBytes	= (const DByte*)input;
980 			InputBytes += BYTES_INC4;
981 			if(ranksUnInitialized)
982 			{
983 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<2]]++ = i;
984 				ranksUnInitialized=false;
985 			}
986 			else
987 			{
988 				const T* Indices		= mRanks;
989 				const T* IndicesEnd	= &mRanks[nb];
990 				while(Indices!=IndicesEnd)
991 				{
992 					const T id = *Indices++;
993 					*Link[InputBytes[id<<2]]++ = id;
994 				}
995 			}
996 
997 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
998 			T* Tmp = mRanks;
999 			mRanks = mRanks2;
1000 			mRanks2 = Tmp;
1001 		}
1002 	}
1003     gdlAlignedFree(mRanks2);
1004     return mRanks;
1005 }
1006 
1007 template<typename T>
static T* RadixSort(const DULong* input, SizeT nb)
1009 {
1010     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
1011     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
1012 	// Allocate histograms & offsets on the stack
1013 	T Histogram[256*4];
1014 	T* Link[256];
1015     bool ranksUnInitialized=true;
1016 
1017 	// Create histograms (counters). Counters for all passes are created in one run.
1018 	// Pros:	read input buffer once instead of four times
1019 	// Cons:	mHistogram is 4Kb instead of 1Kb
1020 	// We must take care of signed/unsigned values for temporal coherence.... I just
1021 	// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
1022 
1023     CREATE_HISTOGRAMS4(DULong, input);
1024 
1025 	// Radix sort, j is the pass number (0=LSB, 3=MSB)
1026 	for(int j=0;j<4;++j)
1027 	{
1028 		CHECK_PASS_VALIDITY(j);
1029 
		// Sometimes a pass is skipped because all values share the same byte at that position (the type is
		// unsigned, so there is no negative special case). This is not a problem, numbers are correctly sorted anyway.
1032 		if(PerformPass)
1033 		{
1034             // Here we deal with positive values only
1035 
1036             // Create offsets
1037             Link[0] = mRanks2;
1038             for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
1039 
1040 			// Perform Radix Sort
1041 			const DByte* InputBytes	= (const DByte*)input;
1042 			InputBytes += BYTES_INC4;
1043 			if(ranksUnInitialized)
1044 			{
1045 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<2]]++ = i;
1046 				ranksUnInitialized=false;
1047 			}
1048 			else
1049 			{
1050 				const T* Indices		= mRanks;
1051 				const T* IndicesEnd	= &mRanks[nb];
1052 				while(Indices!=IndicesEnd)
1053 				{
1054 					const T id = *Indices++;
1055 					*Link[InputBytes[id<<2]]++ = id;
1056 				}
1057 			}
1058 
1059 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
1060 			T* Tmp = mRanks;
1061 			mRanks = mRanks2;
1062 			mRanks2 = Tmp;
1063 		}
1064 	}
1065     gdlAlignedFree(mRanks2);
1066     return mRanks;
1067 }
1068 
1069 template<typename T>
static T* RadixSort(const DInt* input, SizeT nb)
1071 {
1072     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
1073     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
1074 	// Allocate histograms & offsets on the stack
1075 	T Histogram[256*2];
1076 	T* Link[256];
1077     bool ranksUnInitialized=true;
1078 
1079     CREATE_HISTOGRAMS2(DInt, input);
1080 
1081 	// Radix sort, j is the pass number (0=LSB, 1=MSB)
1082 	for(int j=0;j<2;++j)
1083 	{
1084 		CHECK_PASS_VALIDITY(j);
1085 
1086 		if(PerformPass)
1087 		{
1088 			// Should we care about negative values?
1089 			if(j!=1)
1090 			{
1091 				// Here we deal with positive values only
1092 				// Create offsets
1093 				Link[0] = mRanks2;
1094 				for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
1095 			}
1096 			else
1097 			{
1098 				// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
1099 				// Compute #negative values involved if needed
1100 				T NbNegativeValues = 0;
                // An efficient way to compute the number of negative values we'll have to deal with is simply to sum the 128
1102                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
1103                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
1104                 const T* h1 = &Histogram[H1_OFFSET2];
1105                 for(int i=128;i<256;++i)	NbNegativeValues += h1[i];	// 768 for last histogram, 128 for negative part
1106 
1107 				// Create biased offsets, in order for negative numbers to be sorted as well
1108 				Link[0] = &mRanks2[NbNegativeValues];										// First positive number takes place after the negative ones
1109 				for(int i=1;i<128;++i)		Link[i] = Link[i-1] + CurCount[i-1];		// 1 to 128 for positive numbers
1110 
1111 				// Fixing the wrong place for negative values
1112 				Link[128] = mRanks2;
1113 				for(int i=129;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
1114 			}
1115 
1116 			// Perform Radix Sort
1117 			const DByte* InputBytes	= (const DByte*)input;
1118 			InputBytes += BYTES_INC2;
1119 			if(ranksUnInitialized)
1120 			{
1121 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<1]]++ = i;
1122 				ranksUnInitialized=false;
1123 			}
1124 			else
1125 			{
1126 				const T* Indices		= mRanks;
1127 				const T* IndicesEnd	= &mRanks[nb];
1128 				while(Indices!=IndicesEnd)
1129 				{
1130 					const T id = *Indices++;
1131 					*Link[InputBytes[id<<1]]++ = id;
1132 				}
1133 			}
1134 
1135 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
1136 			T* Tmp = mRanks;
1137 			mRanks = mRanks2;
1138 			mRanks2 = Tmp;
1139 		}
1140 	}
1141     gdlAlignedFree(mRanks2);
1142     return mRanks;
1143 }
1144 
1145 template<typename T>
static T* RadixSort(const DUInt* input, SizeT nb)
1147 {
1148     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
1149     T* mRanks2=(T*)gdlAlignedMalloc(nb*sizeof(T));
1150 	// Allocate histograms & offsets on the stack
1151 	T Histogram[256*2];
1152 	T* Link[256];
1153     bool ranksUnInitialized=true;
1154 
1155     CREATE_HISTOGRAMS2(DUInt, input);
1156 
1157 	// Radix sort, j is the pass number (0=LSB, 1=MSB)
1158 	for(int j=0;j<2;++j)
1159 	{
1160 		CHECK_PASS_VALIDITY(j);
1161 
1162 		if(PerformPass)
1163 		{
1164             // Here we deal with positive values only
1165 
1166             // Create offsets
1167             Link[0] = mRanks2;
1168             for(int i=1;i<256;++i)		Link[i] = Link[i-1] + CurCount[i-1];
1169 
1170 			// Perform Radix Sort
1171 			const DByte* InputBytes	= (const DByte*)input;
1172 			InputBytes += BYTES_INC2;
1173 			if(ranksUnInitialized)
1174 			{
1175 				for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i<<1]]++ = i;
1176 				ranksUnInitialized=false;
1177 			}
1178 			else
1179 			{
1180 				const T* Indices		= mRanks;
1181 				const T* IndicesEnd	= &mRanks[nb];
1182 				while(Indices!=IndicesEnd)
1183 				{
1184 					const T id = *Indices++;
1185 					*Link[InputBytes[id<<1]]++ = id;
1186 				}
1187 			}
1188 
1189 			// Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
1190 			T* Tmp = mRanks;
1191 			mRanks = mRanks2;
1192 			mRanks2 = Tmp;
1193 		}
1194 	}
1195     gdlAlignedFree(mRanks2);
1196     return mRanks;
1197 }
1198 
1199 template<typename T>
static T* RadixSort(const DByte* input, SizeT nb)
1201 {
1202 	// Allocate histograms & offsets on the stack
1203     T* mRanks=(T*)gdlAlignedMalloc(nb*sizeof(T));
1204 	T Histogram[256];
1205 	T* Link[256];
1206 
1207     /* Clear counters/histograms */
1208     memset(Histogram, 0, 256 * sizeof (T));
1209 
1210     /* Prepare to count */
1211     const DByte* p = (const DByte*) input;
1212     const DByte* pe = &p[nb];
1213     T* h0 = Histogram;
1214 
1215     bool AlreadySorted = true; /* Optimism... */
1216     for (SizeT i = 0; i < nb; ++i) mRanks[i] = i;
1217 
1218     /* Prepare for temporal coherence */
1219     const DByte* Running = (DByte*) input;
1220     DByte PrevVal = *Running;
1221 
1222     while (p != pe) {
1223       /* Read input buffer in previous sorted order */
1224       const DByte Val = *Running++;
1225       /* Check whether already sorted or not */
1226       if (Val < PrevVal) {
1227         AlreadySorted = false;
1228         break;
1229       } /* Early out */
1230       /* Update for next iteration */
1231       PrevVal = Val;
1232 
1233       /* Create histograms */
1234       h0[*p++]++;
1235     }
1236 
1237     /* If all input values are already sorted, we just have to return and leave the */
1238     /* previous list unchanged. That way the routine may take advantage of temporal */
1239     /* coherence, for example when used to sort transparent faces.					*/
1240     if (AlreadySorted) return mRanks;
1241 
1242     /* Else there has been an early out and we must finish computing the histograms */
1243     while (p != pe) {
1244       /* Create histograms without the previous overhead */
1245       h0[*p++]++;
1246     }
1247 
1248     /* If all values have the same byte, sorting is useless. */
1249     /* It may happen when sorting bytes or words instead of dwords. */
1250     /* This routine actually sorts words faster than dwords, and bytes */
    /* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */
1252     /* for words and O(n) for bytes. Running time for floats depends on actual values... */
1253 
1254     /* Get first byte */
1255     const DByte UniqueVal = *((DByte*) input);
1256 
1257     /* Check that byte's counter */
1258     if (Histogram[UniqueVal] == nb) return mRanks; //already sorted too
1259 
1260     // Here we deal with positive values only
1261     // Create offsets
1262     Link[0] = mRanks;
1263     for(int i=1;i<256;++i) Link[i] = Link[i-1] + Histogram[i-1];
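    // Link[] now holds prefix sums of the histogram: e.g. if Histogram[0]=3 and Histogram[1]=2 then
    // Link[0]=&mRanks[0], Link[1]=&mRanks[3], Link[2]=&mRanks[5], i.e. Link[v] points at the first
    // output slot reserved for indices whose byte value is v.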
1264 
1265     // Perform Radix Sort
1266     const DByte* InputBytes	= (const DByte*)input;
1267     for(SizeT i=0;i<nb;++i)	*Link[InputBytes[i]]++ = i;
1268     return mRanks;
1269 }
1270 
1271 
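  // Sorts val[lo..hi] by filling index[lo..hi] with a permutation of lo..hi that orders the values
  // ascending (entries of index outside [lo,hi] are left untouched); a thin wrapper over RadixSort<T>().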
1272   template<typename T, typename Q>
  static void RadixSortIndex(const Q* val, T* index, SizeT lo, SizeT hi)
1274   {
1275     SizeT length = hi - lo + 1;
1276     T* res = RadixSort<T>(&(val[lo]), length); //sorted indexes are between 0 and length-1
1277     for (SizeT i=0; i< length; ++i) index[lo+i]=res[i]+lo;  //we need them between lo and hi...
1278     gdlAlignedFree(res);
1279   }
1280 
1281 //-------------------------------------
1282 
1283 
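// Comparison helpers used by the non-radix index sorts below; the DFloat/DDouble specializations treat
// NaN as larger than any number, so NaNs end up at the end of the sort order.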
1284 template<typename T>
inline bool less (T &v, T &w)
1286 {
1287     return (v < w);
1288 }
1289 
1290 template<>
inline bool less (DFloat &v, DFloat &w)
1292 {
1293     return (v < w || std::isnan(w) );
1294 }
1295 
1296 template<>
inline bool less (DDouble &v, DDouble &w)
1298 {
1299     return (v < w || std::isnan(w) );
1300 }
1301 
1302 template<typename T>
inline bool leq (T &v, T &w)
1304 {
1305     return (v <= w);
1306 }
1307 
1308 template<>
inline bool leq (DFloat &v, DFloat &w)
1310 {
1311     return (v <= w || std::isnan(w) );
1312 }
1313 
1314 template<>
1315 inline bool leq (DDouble &v, DDouble &w)
1316 {
1317     return (v <= w || std::isnan(w) );
1318 }
1319 
1320 template<typename T>
1321 inline bool geq (T &v, T &w)
1322 {
1323     return (v >= w);
1324 }
1325 
1326 template<>
1327 inline bool geq (DFloat &v, DFloat &w)
1328 {
1329     return (v >= w || std::isnan(v) );
1330 }
1331 
1332 template<>
1333 inline bool geq (DDouble &v, DDouble &w)
1334 {
1335     return (v >= w || std::isnan(v) );
1336 }
1337 
1338 template<typename T>
1339 inline bool eq (T &v, T &w)
1340 {
1341     return w == v;
1342 }
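// Reading aid (not authoritative): the specializations above push NaNs past every ordinary value.
// With w = NaN, less(v, w) and leq(v, w) are true for any finite v, while geq(v, w) is true only
// when v is itself NaN; an index sort built on these comparators therefore ranks every NaN entry
// after all finite values.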
1343 
1344 template <typename IndexT>
1345 inline void swap(IndexT* z, SizeT a, SizeT b)
1346 {
1347     IndexT t = z[a];
1348     z[a] = z[b];
1349     z[b] = t;
1350 }
1351 
1352 template<typename T, typename IndexT>
1353 inline void insertionSortIndex (T* val, IndexT* index, SizeT lo, SizeT hi)
1354 {
1355     for (SizeT i = lo; i <= hi; ++i)
1356     {
1357         for (SizeT j = i; j > lo && less(val[index[j]], val[index[j-1]]); --j)
1358         {
1359             swap(index,j,j-1);
1360         }
1361     }
1362 }
1363 
1364 template<typename T, typename IndexT>
1365 inline SizeT median3_for_qsort (T* val, IndexT* index, SizeT indI, SizeT indJ, SizeT indK)
1366 {
1367     return (less(val[index[ indI]], val[index[ indJ]]) ?
1368             (less(val[index[ indJ]], val[index[ indK]]) ? indJ : less(val[index[ indI]], val[index[ indK]]) ? indK : indI) :
1369             (less(val[index[ indK]], val[index[ indJ]]) ? indJ : less(val[index[ indK]], val[index[ indI]]) ? indK : indI));
1370 }
1371 
1372 
1373 template <typename T, typename IndexT>
1374  static void QuickSortIndex(T* val, IndexT* index, DLong left, DLong right)
1375 {
1376 
1377     if (right <= 0)
1378         return;
1379 
1380     DLong length = right - left + 1;
1381 
1382     // cutoff to insertion sort
1383     if (length < INSERTION_SORT_THRESHOLD)
1384     {
1385         insertionSortIndex(val, index, left, right);
1386         return;
1387     }
1388 
1389     // use median-of-3 as partitioning element
1390     else if (length < INSERTION_SORT_THRESHOLD*4)
1391     {
1392         DLong median = median3_for_qsort(val, index, left, left + length / 2, right);
1393         swap(index,median,left);
1394     }
1395 
1396     // use Tukey ninther as partitioning element
1397     else
1398     {
1399         DLong eps = length / 8;
1400         DLong mid = left + length / 2;
1401         DLong mFirst = median3_for_qsort(val, index, left, left + eps, left + eps + eps);
1402         DLong mMid = median3_for_qsort(val, index, mid - eps, mid, mid + eps);
1403         DLong mLast = median3_for_qsort(val, index, right - eps - eps, right - eps, right);
1404         DLong ninther = median3_for_qsort(val, index, mFirst, mMid, mLast);
1405         swap(index,ninther,left);
1406     }
1407 
1408     // Bentley-McIlroy 3-way partitioning
1409     DLong i = left, j = right + 1;
1410     DLong index1 = left, index2 = right + 1;
1411 
1412     for (;; )
1413     {
1414         T pivot = val[index[left]];
1415         while (less(val[index[++i]], pivot))
1416         {
1417             if (i == right)
1418                 break;
1419         }
1420         while (less(pivot, val[index[--j]]))
1421         {
1422             if (j == left)
1423                 break;
1424         }
1425         if (i >= j) break;
1426         swap(index,i,j);
1427         if (eq(val[index[i]], pivot))
1428             swap(index,++index1,i);
1429         if (eq(val[index[j]], pivot))
1430             swap(index,--index2,j);
1431     }
1432     swap(index,left,j);
1433 
1434     i = j + 1;
1435     j = j - 1;
1436     for (DLong k = left + 1; k <= index1; ++k)
1437     {
1438         swap(index,k,j--);
1439     }
1440     for (DLong k = right  ; k >= index2; --k)
1441     {
1442         swap(index,k,i++);
1443     }
1444 
1445 //    QuickSortIndex(val, index, left, j);
1446 //    QuickSortIndex(val, index, i, right);
1447 
1448      // same with parallelism
1449     DLong Left[2] = {left, i};
1450     DLong Right[2] = {j, right};
1451 #pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
1452     for (int i = 0; i < 2; i++) QuickSortIndex(val, index, Left[i], Right[i]);
1453 }
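// Reading aid (not authoritative): in the Bentley-McIlroy scheme above, keys equal to the pivot are
// parked at the two outer ends of the range during the scan (tracked by index1/index2), and the two
// loops after the main partition swap them back next to the pivot's final position. For a large array
// dominated by one repeated value, the equal keys collapse into that middle block after a single pass,
// so the recursive calls only see the strictly-smaller and strictly-larger elements; this is what keeps
// duplicate-heavy inputs from degrading toward quadratic time.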
1454 
1455 
1456   template< typename T, typename IndexT>
1457    static void MergeNoCopyIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT mid, SizeT high, T* val)
1458   {
1459     SizeT i = low, j = mid + 1;
1460     for (SizeT k = low; k <= high; ++k) {
1461       if (i > mid) index[k] = aux[j++];
1462       else if (j > high) index[k] = aux[i++];
1463       else if (less(val[aux[j]] , val[aux[i]])) index[k] = aux[j++];
1464       else index[k] = aux[i++];
1465     }
1466   }
1467 
1468   template< typename T, typename IndexT>
1469    static void MergeSortIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val)
1470   {
1471     SizeT length = high - low + 1;
1472     if (length < 2) return;
1473     if (length < INSERTION_SORT_THRESHOLD) {
1474       insertionSortIndex(val, index, low, high);
1475       memcpy(&(aux[low]), &(index[low]), length*sizeof(IndexT));
1476       return;
1477     }
1478 
1479 // else MERGESORT:
1480 
1481 //    SizeT mid = low + (high - low) / 2;
1482 //    MergeSortIndexAux(index, aux, low, mid, val);
1483 //    MergeSortIndexAux(index, aux, mid+1, high, val);
1484 
1485 //     same with parallelism
1486     SizeT mid = low + (high - low) / 2;
1487     SizeT Left[2] = {low, mid + 1};
1488     SizeT Right[2] = {mid, high};
1489 #pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
1490     for (int i = 0; i < 2; i++) MergeSortIndexAux(index, aux, Left[i], Right[i], val);
1491 
1492     // If arrays are already sorted, finished.  This is an
1493     // optimization that results in faster sorts for nearly ordered lists.
1494     if (val[aux[mid + 1]] >= val[aux[mid]]) {
1495       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1496       return;
1497     }
1498 
1499     // If arrays are inverted just swap.
1500     if (leq(val[aux[high]], val[aux[low]])) {
1501       SizeT left = mid - low + 1;
1502       SizeT right = high - mid;
1503       // swap parts:
1504       memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); //copy 'left' values in aux
1505       memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); //copy 'right' values starting at low
1506       memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); //give back aux
1507       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1508       return;
1509     }
1510 
1511     MergeNoCopyIndexAux(aux, index, low, mid, high, val);
1512   }
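// Reading aid (not authoritative): the two shortcuts above each cost a single comparison.
// If the first key of the right half already compares >= the last key of the left half, the two
// sorted halves are simply concatenated (the memcpy back into index). If instead every key of the
// right half is <= every key of the left half (since both halves are sorted, testing
// val[aux[high]] <= val[aux[low]] suffices), the merge degenerates to a block rotation done with
// three memmove calls using index[] as scratch: e.g. a left half ranking values {5,6,7} and a right
// half ranking {1,2} become {1,2,5,6,7} without any element-by-element merging.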
1513 
1514   //This should not be used for FLOATS and DOUBLES, as the resulting sort order, although arguably better, differs from IDL's.
1515   template< typename T, typename IndexT>
1516    static void AdaptiveSortIndexAux(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val)
1517   {
1518     SizeT length = high - low + 1;
1519     if (length < 2) return;
1520     if (length < INSERTION_SORT_THRESHOLD) {
1521       insertionSortIndex(val, index, low, high);
1522       return;
1523     }
1524     //  RadixSort is stable and differentiates -0 and +0.
1525     //  InsertionSort and MergeSort are stable but do not differentiate -0 and +0.
1526     //  QuickSort is not stable https://en.wikipedia.org/wiki/Sorting_algorithm#Stability (it does not keep temporal coherence)
1527     //  and does not differentiate -0 and +0.
1528     //  QuickSort should therefore not be permitted as a default sorting algorithm, but since IDL says that
1529     //  "If Array contains any identical elements, the order in which the identical elements
1530     //  are sorted is arbitrary and may vary between operating systems.", I permit it.
1531 
1532     else if (length < RADIX_SORT_THRESHOLD_ADAPT) { //could be faster if alloc/dealloc was not performed in RadixSort...
1533       RadixSortIndex(val, index, low, high);
1534       return;
1535     }
1536 // else MERGESORT
1537     //    SizeT mid = low + (high - low) / 2;
1538     //    AdaptiveSortIndexAux(index, aux, low, mid, val);
1539     //    AdaptiveSortIndexAux(index, aux, mid+1, high, val);
1540 
1541     // same with parallelism
1542     SizeT mid = low + (high - low) / 2;
1543     SizeT Left[2] = {low, mid + 1};
1544     SizeT Right[2] = {mid, high};
1545 #pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
1546     for (int i = 0; i < 2; i++) AdaptiveSortIndexAux(index, aux, Left[i], Right[i], val);
1547 
1548     // If arrays are already sorted, finished.  This is an
1549     // optimization that results in faster sorts for nearly ordered lists. No need to care about NaNs
1550     if (val[aux[mid + 1]] >= val[aux[mid]]) {
1551       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1552       return;
1553     }
1554 
1555     // If arrays are inverted just swap. No need to care about NaNs
1556     if (val[aux[high]] <= val[aux[low]]) {
1557       SizeT left = mid - low + 1;
1558       SizeT right = high - mid;
1559       // swap parts:
1560       memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); //copy 'left' values in aux
1561       memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); //copy 'right' values starting at low
1562       memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); //give back aux
1563       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1564       return;
1565     }
1566 
1567     MergeNoCopyIndexAux(aux, index, low, mid, high, val);
1568   }
1569 
1570   // The only difference is that, due to -NaN, RadixSort is forbidden for Floats and Doubles: the ranking produced by Radix() is not
1571   // the same as with the other sorting algorithms. A pity, as it is actually better!
1572   template< typename T, typename IndexT>
1573   static void AdaptiveSortIndexAuxWithNaN(IndexT* aux, IndexT* index, SizeT low, SizeT high, T* val) {
1574     SizeT length = high - low + 1;
1575     if (length < 2) return;
1576     if (length < INSERTION_SORT_THRESHOLD) {
1577       insertionSortIndex(val, index, low, high);
1578       return;
1579     }
1580     //ONLY MERGESORT
1581 
1582 //        SizeT mid = low + (high - low) / 2;
1583 //        AdaptiveSortIndexAuxWithNaN(index, aux, low, mid, val);
1584 //        AdaptiveSortIndexAuxWithNaN(index, aux, mid+1, high, val);
1585 
1586     // same with parallelism
1587     SizeT mid = low + (high - low) / 2;
1588     SizeT Left[2] = {low, mid + 1};
1589     SizeT Right[2] = {mid, high};
1590 #pragma omp parallel for num_threads(2) if (length >= MERGESORT_PARALLEL_THRESHOLD && CpuTPOOL_NTHREADS > 1)
1591     for (int i = 0; i < 2; i++) AdaptiveSortIndexAuxWithNaN(index, aux, Left[i], Right[i], val);
1592 
1593     // If arrays are already sorted, finished.  This is an
1594     // optimization that results in faster sorts for nearly ordered lists.
1595     if (geq(val[aux[mid + 1]] ,val[aux[mid]])) {
1596       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1597       return;
1598     }
1599 
1600     // If arrays are inverted just swap.
1601     if (leq(val[aux[high]], val[aux[low]])) {
1602       SizeT left = mid - low + 1;
1603       SizeT right = high - mid;
1604       // swap parts:
1605       memmove(&(index[low]), &(aux[low]), left * sizeof (IndexT)); //copy 'left' values in aux
1606       memmove(&(aux[low]), &(aux[mid + 1]), right * sizeof (IndexT)); //copy 'right' values starting at low
1607       memmove(&(aux[low + right]), &(index[low]), left * sizeof (IndexT)); //give back aux
1608       memcpy(&(index[low]), &(aux[low]), length * sizeof (IndexT)); //give back sub
1609       return;
1610     }
1611 
1612     MergeNoCopyIndexAux(aux, index, low, mid, high, val);
1613   }
1614 
1615   template< typename T, typename IndexT>
1616   inline void MergeSortIndex(T* val, IndexT* index, SizeT low, SizeT high)
1617   {
1618     IndexT* aux = new IndexT[high - low + 1];
1619     for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
1620     MergeSortIndexAux(aux, index, low, high, val);
1621     delete[] aux;
1622   }
1623 
1624   template< typename T, typename IndexT>
1625   inline void AdaptiveSortIndex(T* val, IndexT* index, SizeT low, SizeT high)
1626   {
1627     IndexT* aux = new IndexT[high - low + 1];
1628     for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
1629     AdaptiveSortIndexAux(aux, index, low, high, val);
1630     delete[] aux;
1631   }
1632 
1633   template< typename T, typename IndexT>
1634   inline void AdaptiveSortIndexWithNaN(T* val, IndexT* index, SizeT low, SizeT high)
1635   {
1636     IndexT* aux = new IndexT[high - low + 1];
1637     for (SizeT i = 0; i < high - low + 1; ++i) aux[i] = i;
1638     AdaptiveSortIndexAuxWithNaN(aux, index, low, high, val);
1639     delete[] aux;
1640   }
1641 //--------------------------------------------------------------------------------------------------------------------
1642 // Sorting algos: The "private" GDL_SORT enables the keywords QUICK, MERGE, RADIX and INSERT (and AUTO). These are not there for the user
1643 // to choose the algorithm (s)he wants. They are primarily there to test the relative speed of each of them and to find, for a given machine,
1644 // where the default algorithm, which is basically a clever combination of all of them, should switch from one algorithm to another
1645 // in order to retain maximum efficiency.
1646 // One could use testsuite/benchmark/compare_sort_algos.pro to, e.g., update the magic values in the code, or, better, this could
1647 // be done automatically via a training procedure writing its results to a file in ~/.gdl; the present code could then use, for example,
1648 // the values of !GDL_SORT_THRESHOLDS, a system variable (struct) whose values would be updated in that ~/.gdl file.
1649 // The default values were chosen for my Intel i7, 4 cores, 8 threads.
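// Illustrative usage (an assumption about how the private function would be exercised from GDL,
// not a reference): with 'arr' any numeric array,
//   idx = GDL_SORT(arr, /MERGE)   ; force the merge-sort path
//   idx = GDL_SORT(arr, /RADIX)   ; force the radix path
//   idx = GDL_SORT(arr, /AUTO)    ; use the adaptive combination of the methods
// Timing such calls over the keywords and a range of array sizes is how
// testsuite/benchmark/compare_sort_algos.pro can be used to locate the threshold values defined above.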
1650 
1651   BaseGDL* gdl_sort_fun(EnvT* e)
1652   {
1653     e->NParam(1);
1654 
1655     BaseGDL* p0 = e->GetParDefined(0);
1656 
1657     if (p0->Type() == GDL_STRUCT)
1658       e->Throw("Struct expression not allowed in this context: " +
1659         e->GetParString(0));
1660 
1661 
1662     static int l64Ix = e->KeywordIx("L64");
1663     bool l64 = e->KeywordSet(l64Ix);
1664 
1665     static int qsortIx = e->KeywordIx("QUICK");
1666     bool quick = e->KeywordSet(qsortIx);
1667     static int mergeIx = e->KeywordIx("MERGE");
1668     bool merge = e->KeywordSet(mergeIx);
1669     static int radixIx = e->KeywordIx("RADIX");
1670     bool radix = e->KeywordSet(radixIx);
1671     static int insertIx = e->KeywordIx("INSERT");
1672     bool insert = e->KeywordSet(insertIx);
1673     static int autoIx = e->KeywordIx("AUTO");
1674     bool doauto = e->KeywordSet(autoIx);
1675     if (!(radix || quick || merge || insert || doauto)) e->Throw("I need one of the QUICK, MERGE, RADIX, INSERT or AUTO keywords set.");
1676 
1677     SizeT nEl = p0->N_Elements();
1678     if (insert && nEl > 50000) e->Throw("INSERT would take forever on "+i2s(nEl)+" elements");
1679     //Radix sort is special in that it is not obvious (due to shortcuts) which of the 2 internal storage
1680     //arrays is returned as the sorted index.
1681     if (radix) {
1682       DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::NOALLOC);
1683       DLong* index = NULL;
1684       if (p0->Type() == GDL_DOUBLE) {
1685         DDouble* val = (DDouble*) (static_cast<DDoubleGDL*> (p0)->DataAddr());
1686         index = (DLong*) RadixSort<DULong>(val, nEl);
1687       } else if (p0->Type() == GDL_FLOAT) {
1688         DFloat* val = (DFloat*) (static_cast<DFloatGDL*> (p0)->DataAddr());
1689         index = (DLong*) RadixSort<DULong>(val, nEl);
1690       } else if (p0->Type() == GDL_LONG) {
1691         DLong* val = (DLong*) (static_cast<DLongGDL*> (p0)->DataAddr());
1692         index = (DLong*) RadixSort<DULong>(val, nEl);
1693       } else if (p0->Type() == GDL_ULONG) {
1694         DULong* val = (DULong*) (static_cast<DULongGDL*> (p0)->DataAddr());
1695         index = (DLong*) RadixSort<DULong>(val, nEl);
1696       } else if (p0->Type() == GDL_LONG64) {
1697         DLong64* val = (DLong64*) (static_cast<DLong64GDL*> (p0)->DataAddr());
1698         index = (DLong*) RadixSort<DULong>(val, nEl);
1699       } else if (p0->Type() == GDL_ULONG64) {
1700         DULong64* val = (DULong64*) (static_cast<DULong64GDL*> (p0)->DataAddr());
1701         index = (DLong*) RadixSort<DULong>(val, nEl);
1702       } else if (p0->Type() == GDL_INT) {
1703         DInt* val = (DInt*) (static_cast<DIntGDL*> (p0)->DataAddr());
1704         index = (DLong*) RadixSort<DULong>(val, nEl);
1705       } else if (p0->Type() == GDL_UINT) {
1706         DUInt* val = (DUInt*) (static_cast<DUIntGDL*> (p0)->DataAddr());
1707         index = (DLong*) RadixSort<DULong>(val, nEl);
1708       } else if (p0->Type() == GDL_BYTE) {
1709         DByte* val = (DByte*) (static_cast<DByteGDL*> (p0)->DataAddr());
1710         index = (DLong*) RadixSort<DULong>(val, nEl);
1711       } else if (p0->Type() == GDL_COMPLEX) {
1712         DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
1713         DComplex *ff = (DComplex*) p0F->DataAddr();
1714         // create temp values for magnitude of complex
1715         DFloat* magnitude = new DFloat[nEl];
1716         for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
1717         index = (DLong*) RadixSort<DULong>(magnitude, nEl);
1718         delete[] magnitude;
1719       } else if (p0->Type() == GDL_COMPLEXDBL) {
1720         DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
1721         DComplexDbl *ff = (DComplexDbl*) p0F->DataAddr();
1722         // create temp values for magnitude of complex
1723         DDouble* magnitude = new DDouble[nEl];
1724         for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
1725         index = (DLong*) RadixSort<DULong>(magnitude, nEl);
1726         delete[] magnitude;
1727       }
1728       res->SetBuffer(index);
1729       res->SetBufferSize(nEl);
1730       res->SetDim(dimension(nEl));
1731       return res;
1732     } else {
1733       if (p0->Type() == GDL_DOUBLE) {
1734         DDouble* val = (DDouble*) (static_cast<DDoubleGDL*> (p0)->DataAddr());
1735         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1736         // NaNs are not well handled by the other sorts...
1737         DLong *hh = static_cast<DLong*> (res->DataAddr());
1738         SizeT low = 0;
1739         SizeT high = nEl - 1;
1740         if (merge) {
1741           MergeSortIndex(val, hh, low, high);
1742         } else if (quick) {
1743           QuickSortIndex(val, hh, low, high);
1744         } else if (insert) {
1745           insertionSortIndex(val, hh, low, high);
1746         } else {
1747           AdaptiveSortIndexWithNaN(val, hh, low, high);
1748         }
1749         return res;
1750       } else if (p0->Type() == GDL_FLOAT) {
1751         DFloat* val = (DFloat*) (static_cast<DFloatGDL*> (p0)->DataAddr());
1752         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1753         // NaNs are not well handled by the other sorts...
1754         DLong *hh = static_cast<DLong*> (res->DataAddr());
1755         SizeT low = 0;
1756         SizeT high = nEl - 1;
1757         if (merge) {
1758           MergeSortIndex(val, hh, low, high);
1759         } else if (quick) {
1760           QuickSortIndex(val, hh, low, high);
1761         } else if (insert) {
1762           insertionSortIndex(val, hh, low, high);
1763         } else {
1764           AdaptiveSortIndexWithNaN(val, hh, low, high);
1765         }
1766         return res;
1767       } else if (p0->Type() == GDL_LONG) {
1768         DLong* val = (DLong*) (static_cast<DLongGDL*> (p0)->DataAddr());
1769         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1770         DLong *hh = static_cast<DLong*> (res->DataAddr());
1771         SizeT low = 0;
1772         SizeT high = nEl - 1;
1773         if (merge) {
1774           MergeSortIndex(val, hh, low, high);
1775         } else if (quick) {
1776           QuickSortIndex(val, hh, low, high);
1777         } else if (insert) {
1778           insertionSortIndex(val, hh, low, high);
1779         } else {
1780           AdaptiveSortIndex(val, hh, low, high);
1781         }
1782         return res;
1783       } else if (p0->Type() == GDL_ULONG) {
1784         DULong* val = (DULong*) (static_cast<DULongGDL*> (p0)->DataAddr());
1785         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1786         DLong *hh = static_cast<DLong*> (res->DataAddr());
1787         SizeT low = 0;
1788         SizeT high = nEl - 1;
1789         if (merge) {
1790           MergeSortIndex(val, hh, low, high);
1791         } else if (quick) {
1792           QuickSortIndex(val, hh, low, high);
1793         } else if (insert) {
1794           insertionSortIndex(val, hh, low, high);
1795         } else {
1796           AdaptiveSortIndex(val, hh, low, high);
1797         }
1798         return res;
1799       } else if (p0->Type() == GDL_LONG64) {
1800         DLong64* val = (DLong64*) (static_cast<DLong64GDL*> (p0)->DataAddr());
1801         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1802         DLong *hh = static_cast<DLong*> (res->DataAddr());
1803         SizeT low = 0;
1804         SizeT high = nEl - 1;
1805         if (merge) {
1806           MergeSortIndex(val, hh, low, high);
1807         } else if (quick) {
1808           QuickSortIndex(val, hh, low, high);
1809         } else if (insert) {
1810           insertionSortIndex(val, hh, low, high);
1811         } else {
1812           AdaptiveSortIndex(val, hh, low, high);
1813         }
1814         return res;
1815       } else if (p0->Type() == GDL_ULONG64) {
1816         DULong64* val = (DULong64*) (static_cast<DULong64GDL*> (p0)->DataAddr());
1817         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1818         DLong *hh = static_cast<DLong*> (res->DataAddr());
1819         SizeT low = 0;
1820         SizeT high = nEl - 1;
1821         if (merge) {
1822           MergeSortIndex(val, hh, low, high);
1823         } else if (quick) {
1824           QuickSortIndex(val, hh, low, high);
1825         } else if (insert) {
1826           insertionSortIndex(val, hh, low, high);
1827         } else {
1828           AdaptiveSortIndex(val, hh, low, high);
1829         }
1830         return res;
1831       } else if (p0->Type() == GDL_INT) {
1832         DInt* val = (DInt*) (static_cast<DIntGDL*> (p0)->DataAddr());
1833         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1834         DLong *hh = static_cast<DLong*> (res->DataAddr());
1835         SizeT low = 0;
1836         SizeT high = nEl - 1;
1837         if (merge) {
1838           MergeSortIndex(val, hh, low, high);
1839         } else if (quick) {
1840           QuickSortIndex(val, hh, low, high);
1841         } else if (insert) {
1842           insertionSortIndex(val, hh, low, high);
1843         } else {
1844           AdaptiveSortIndex(val, hh, low, high);
1845         }
1846         return res;
1847       } else if (p0->Type() == GDL_UINT) {
1848         DUInt* val = (DUInt*) (static_cast<DUIntGDL*> (p0)->DataAddr());
1849         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1850         DLong *hh = static_cast<DLong*> (res->DataAddr());
1851         SizeT low = 0;
1852         SizeT high = nEl - 1;
1853         if (merge) {
1854           MergeSortIndex(val, hh, low, high);
1855         } else if (quick) {
1856           QuickSortIndex(val, hh, low, high);
1857         } else if (insert) {
1858           insertionSortIndex(val, hh, low, high);
1859         } else {
1860           AdaptiveSortIndex(val, hh, low, high);
1861         }
1862         return res;
1863       } else if (p0->Type() == GDL_BYTE) {
1864         DByte* val = (DByte*) (static_cast<DByteGDL*> (p0)->DataAddr());
1865         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1866         DLong *hh = static_cast<DLong*> (res->DataAddr());
1867         SizeT low = 0;
1868         SizeT high = nEl - 1;
1869         if (merge) {
1870           MergeSortIndex(val, hh, low, high);
1871         } else if (quick) {
1872           QuickSortIndex(val, hh, low, high);
1873         } else if (insert) {
1874           insertionSortIndex(val, hh, low, high);
1875         } else {
1876           AdaptiveSortIndex(val, hh, low, high);
1877         }
1878         return res;
1879       } else if (p0->Type() == GDL_COMPLEX) {
1880         DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
1881         DComplex *ff = (DComplex*) p0F->DataAddr();
1882         // create temp values for magnitude of complex
1883         DFloat* magnitude = new DFloat[nEl];
1884         for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
1885         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1886         DLong *hh = static_cast<DLong*> (res->DataAddr());
1887         SizeT low = 0;
1888         SizeT high = nEl - 1;
1889         if (merge) {
1890           MergeSortIndex(magnitude, hh, low, high);
1891         } else if (quick) {
1892           QuickSortIndex(magnitude, hh, low, high);
1893         } else if (insert) {
1894           insertionSortIndex(magnitude, hh, low, high);
1895         } else {
1896           AdaptiveSortIndexWithNaN(magnitude, hh, low, high);
1897         }
1898         delete[] magnitude;
1899         return res;
1900       } else if (p0->Type() == GDL_COMPLEXDBL) {
1901         DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
1902         DComplexDbl *ff = (DComplexDbl*) p0F->DataAddr();
1903         // create temp values for magnitude of complex
1904         DDouble* magnitude = new DDouble[nEl];
1905         for (SizeT i = 0; i < nEl; ++i) magnitude[i] = std::norm(ff[i]);
1906         DLongGDL* res = new DLongGDL(dimension(nEl), BaseGDL::INDGEN);
1907         DLong *hh = static_cast<DLong*> (res->DataAddr());
1908         SizeT low = 0;
1909         SizeT high = nEl - 1;
1910         if (merge) {
1911           MergeSortIndex(magnitude, hh, low, high);
1912         } else if (quick) {
1913           QuickSortIndex(magnitude, hh, low, high);
1914         } else if (insert) {
1915           insertionSortIndex(magnitude, hh, low, high);
1916         } else {
1917           AdaptiveSortIndexWithNaN(magnitude, hh, low, high);
1918         }
1919         delete[] magnitude;
1920         return res;
1921       } else e->Throw("FIXME.");
1922     }
1923    return NULL;
1924   }
1925 
1926   template<typename GDLIndexT, typename IndexT>
1927   inline BaseGDL* do_sort_fun(BaseGDL* p0)
1928   {
1929     SizeT nEl = p0->N_Elements();
1930     if (p0->Type() == GDL_BYTE) { //lack of 'res' creation overhead makes "Bytes Radix Sort" better than anything else.
1931       DByte* val = (DByte*)(static_cast<DByteGDL*>(p0)->DataAddr());
1932       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
1933       IndexT *index;
1934       index=RadixSort<IndexT>( val, nEl);
1935       res->SetBuffer(index);
1936       res->SetBufferSize(nEl);
1937       res->SetDim(dimension(nEl));
1938       return res;
1939     } else if (p0->Type() == GDL_INT) { //still true for INTs
1940       DInt* val = (DInt*)(static_cast<DIntGDL*>(p0)->DataAddr());
1941       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
1942       IndexT *index;
1943       index=RadixSort<IndexT>( val, nEl);
1944       res->SetBuffer(index);
1945       res->SetBufferSize(nEl);
1946       res->SetDim(dimension(nEl));
1947       return res;
1948     } else if (p0->Type() == GDL_UINT) { //still true for UINTs
1949       DUInt* val = (DUInt*)(static_cast<DUIntGDL*>(p0)->DataAddr());
1950       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
1951       IndexT *index;
1952       index=RadixSort<IndexT>( val, nEl);
1953       res->SetBuffer(index);
1954       res->SetBufferSize(nEl);
1955       res->SetDim(dimension(nEl));
1956       return res;
1957     } else if (p0->Type() == GDL_FLOAT) { //adaptive, due to the -NaN different sorting
1958       DFloat* val = (DFloat*)(static_cast<DFloatGDL*>(p0)->DataAddr());
1959       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
1960       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
1961       SizeT low=0;
1962       SizeT high=nEl-1;
1963       AdaptiveSortIndexWithNaN<DFloat, IndexT>( val, hh, low, high);
1964       return res;
1965     } else if (p0->Type() == GDL_DOUBLE) { //adaptive, due to the -NaN different sorting
1966       DDouble* val = (DDouble*)(static_cast<DDoubleGDL*>(p0)->DataAddr());
1967       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
1968       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
1969       SizeT low=0;
1970       SizeT high=nEl-1;
1971       AdaptiveSortIndexWithNaN<DDouble, IndexT>( val, hh, low, high);
1972       return res;
1973     } else if (p0->Type() == GDL_COMPLEX) {//adaptive, due to the -NaN different sorting
1974       DComplexGDL* p0F = static_cast<DComplexGDL*> (p0);
1975       DComplex *ff=(DComplex*)p0F->DataAddr();
1976       // create temp values for magnitude of complex
1977       DFloat* magnitude=new DFloat[nEl];
1978       for (SizeT i=0; i< nEl; ++i) magnitude[i]=std::norm(ff[i]);
1979       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
1980       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
1981       SizeT low=0;
1982       SizeT high=nEl-1;
1983       AdaptiveSortIndexWithNaN<DFloat, IndexT>( magnitude, hh, low, high);
1984       delete[] magnitude;
1985       return res;
1986     } else if (p0->Type() == GDL_COMPLEXDBL) {//adaptive, due to the -NaN different sorting
1987       DComplexDblGDL* p0F = static_cast<DComplexDblGDL*> (p0);
1988       DComplexDbl *ff=(DComplexDbl*)p0F->DataAddr();
1989       // create temp values for magnitude of complex
1990       DDouble* magnitude=new DDouble[nEl];
1991       for (SizeT i=0; i< nEl; ++i) magnitude[i]=std::norm(ff[i]);
1992       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
1993       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
1994       SizeT low=0;
1995       SizeT high=nEl-1;
1996       AdaptiveSortIndexWithNaN<DDouble, IndexT>( magnitude, hh, low, high);
1997       delete[] magnitude;
1998       return res;
1999     } else if (p0->Type() == GDL_LONG && nEl > RADIX_SORT_THRESHOLD_FOR_LONG) {
2000       DLong* val = (DLong*)(static_cast<DLongGDL*>(p0)->DataAddr());
2001       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2002       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2003       SizeT low=0;
2004       SizeT high=nEl-1;
2005       AdaptiveSortIndex<DLong, IndexT>( val, hh, low, high);
2006       return res;
2007     } else if (p0->Type() == GDL_LONG && nEl <= RADIX_SORT_THRESHOLD_FOR_LONG) {
2008       DLong* val = (DLong*)(static_cast<DLongGDL*>(p0)->DataAddr());
2009       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
2010       IndexT *index;
2011       index=RadixSort<IndexT>( val, nEl);
2012       res->SetBuffer(index);
2013       res->SetBufferSize(nEl);
2014       res->SetDim(dimension(nEl));
2015       return res;
2016     } else if (p0->Type() == GDL_ULONG && nEl > RADIX_SORT_THRESHOLD_FOR_ULONG) {
2017       DULong* val = (DULong*)(static_cast<DULongGDL*>(p0)->DataAddr());
2018       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2019       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2020       SizeT low=0;
2021       SizeT high=nEl-1;
2022       AdaptiveSortIndex<DULong, IndexT>( val, hh, low, high);
2023       return res;
2024     } else if (p0->Type() == GDL_ULONG && nEl <= RADIX_SORT_THRESHOLD_FOR_ULONG) {
2025       DULong* val = (DULong*)(static_cast<DULongGDL*>(p0)->DataAddr());
2026       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
2027       IndexT *index;
2028       index=RadixSort<IndexT>( val, nEl);
2029       res->SetBuffer(index);
2030       res->SetBufferSize(nEl);
2031       res->SetDim(dimension(nEl));
2032       return res;
2033     } else if (p0->Type() == GDL_LONG64 && nEl > RADIX_SORT_THRESHOLD_FOR_LONG64) {
2034       DLong64* val = (DLong64*)(static_cast<DLong64GDL*>(p0)->DataAddr());
2035       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2036       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2037       SizeT low=0;
2038       SizeT high=nEl-1;
2039       AdaptiveSortIndex<DLong64, IndexT>( val, hh, low, high);
2040       return res;
2041     } else if (p0->Type() == GDL_LONG64 && nEl <= RADIX_SORT_THRESHOLD_FOR_LONG64) {
2042       DLong64* val = (DLong64*)(static_cast<DLong64GDL*>(p0)->DataAddr());
2043       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
2044       IndexT *index;
2045       index=RadixSort<IndexT>( val, nEl);
2046       res->SetBuffer(index);
2047       res->SetBufferSize(nEl);
2048       res->SetDim(dimension(nEl));
2049       return res;
2050     } else if (p0->Type() == GDL_ULONG64 && nEl > RADIX_SORT_THRESHOLD_FOR_ULONG64) {
2051       DULong64* val = (DULong64*)(static_cast<DULong64GDL*>(p0)->DataAddr());
2052       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2053       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2054       SizeT low=0;
2055       SizeT high=nEl-1;
2056       AdaptiveSortIndex<DULong64, IndexT>( val, hh, low, high);
2057       return res;
2058     } else if (p0->Type() == GDL_ULONG64 && nEl <= RADIX_SORT_THRESHOLD_FOR_ULONG64) {
2059       DULong64* val = (DULong64*)(static_cast<DULong64GDL*>(p0)->DataAddr());
2060       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::NOALLOC);
2061       IndexT *index;
2062       index=RadixSort<IndexT>( val, nEl);
2063       res->SetBuffer(index);
2064       res->SetBufferSize(nEl);
2065       res->SetDim(dimension(nEl));
2066       return res;
2067     } else if (p0->Type() == GDL_STRING) {
2068       DString* val = (DString*)(static_cast<DStringGDL*>(p0)->DataAddr());
2069       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2070       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2071       SizeT low=0;
2072       SizeT high=nEl-1;
2073       QuickSortIndex<DString, IndexT>( val, hh, low, high);
2074       return res;
2075     } else if (p0->Type() == GDL_PTR) {
2076         // actually this sorts by the pointers' heap index values.
2077       DPtr* val = (DPtr*)(static_cast<DPtrGDL*>(p0)->DataAddr()); //heap indexes
2078       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2079       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2080       SizeT low=0;
2081       SizeT high=nEl-1;
2082       AdaptiveSortIndex<DPtr, IndexT>( val, hh, low, high);
2083       return res;
2084     } else if (p0->Type() == GDL_OBJ) {
2085         // idem?
2086       DObj* val = (DObj*)(static_cast<DObjGDL*>(p0)->DataAddr()); //heap indexes
2087       GDLIndexT* res = new GDLIndexT(dimension(nEl), BaseGDL::INDGEN);
2088       IndexT *hh = static_cast<IndexT*> (res->DataAddr());
2089       SizeT low=0;
2090       SizeT high=nEl-1;
2091       AdaptiveSortIndex<DObj, IndexT>( val, hh, low, high);
2092       return res;
2093     }
2094     return NULL;
2095   }
2096   BaseGDL* sort_fun(EnvT* e)
2097   {
2098     BaseGDL* p0 = e->GetParDefined(0);
2099     if (p0->Type() == GDL_STRUCT) e->Throw("Struct expression not allowed in this context: " +e->GetParString(0));
2100     static int l64Ix = e->KeywordIx("L64");
2101     bool l64 = e->KeywordSet(l64Ix);
2102     if (!l64) return do_sort_fun<DLongGDL,DLong>(p0);
2103     else return do_sort_fun<DLong64GDL,DLong64>(p0);
2104   }
2105 }
2106