1 /* pp_sort.c
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
4 * 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11 /*
12 * ...they shuffled back towards the rear of the line. 'No, not at the
13 * rear!' the slave-driver shouted. 'Three files up. And stay there...
14 */
15
16 /* This file contains pp ("push/pop") functions that
17 * execute the opcodes that make up a perl program. A typical pp function
18 * expects to find its arguments on the stack, and usually pushes its
19 * results onto the stack, hence the 'pp' terminology. Each OP structure
20 * contains a pointer to the relevant pp_foo() function.
21 *
22 * This particular file just contains pp_sort(), which is complex
23 * enough to merit its own file! See the other pp*.c files for the rest of
24 * the pp_ functions.
25 */
26
/* On WinCE builds 'small' appears to be a reserved word (or similar),
 * so the identifier is renamed before it is used below. */
#ifdef UNDER_CE
#  define small xsmall
#endif

/* Default for SMALLSORT; a build may predefine it to override. */
#if !defined(SMALLSORT)
#  define SMALLSORT (200)
#endif
35
36 /*
37 * The mergesort implementation is by Peter M. Mcilroy <pmcilroy@lucent.com>.
38 *
39 * The original code was written in conjunction with BSD Computer Software
40 * Research Group at University of California, Berkeley.
41 *
42 * See also: "Optimistic Merge Sort" (SODA '92)
43 *
44 * The integration to Perl is by John P. Linderman <jpl@research.att.com>.
45 *
46 * The code can be distributed under the same terms as Perl itself.
47 *
48 */
49
50 /* Binary merge internal sort, with a few special mods
51 ** for the special perl environment it now finds itself in.
52 **
53 ** Things that were once options have been hotwired
54 ** to values suitable for this use. In particular, we'll always
55 ** initialize looking for natural runs, we'll always produce stable
56 ** output, and we'll always do Peter McIlroy's binary merge.
57 */
58
59 /* Pointer types for arithmetic and storage and convenience casts */
60
/* Convenience casts: view P as a pointer to an element (SV **),
 * or as a pointer to such a pointer (SV ***). */
#define GPTP(P)		((SV **)(P))
#define GPPP(P)		((SV ***)(P))


/* byte offset from pointer P to (larger) pointer Q */
#define BYTEOFF(P, Q)	(((char *)(Q)) - ((char *)(P)))

/* size in bytes of one array element */
#define PSIZE		sizeof(SV *)

/* If PSIZE is power of 2, make PSHIFT that power, if that helps */

#ifdef PSHIFT
/* number of elements between P and (larger) Q */
#define PNELEM(P, Q)	(BYTEOFF(P,Q) >> (PSHIFT))
/* Byte count for N elements.  N may be negative (PINDEX(t, -1) is used
 * by the merge), and left-shifting a negative value is undefined
 * behaviour in C, so scale by multiplication instead of shifting N. */
#define PNBYTE(N)	((N) * (1L << (PSHIFT)))
/* pointer to the element N positions away from P */
#define PINDEX(P, N)	(GPTP((char *)(P) + PNBYTE(N)))
#else
/* Leave optimization to compiler */
#define PNELEM(P, Q)	(GPTP(Q) - GPTP(P))
#define PNBYTE(N)	((N) * (PSIZE))
#define PINDEX(P, N)	(GPTP(P) + (N))
#endif

/* Pointer into other corresponding to pointer into this */
#define POTHER(P, THIS, OTHER) GPTP(((char *)(OTHER)) + BYTEOFF(THIS,P))

/* Copy elements from src to dst, advancing both, until src reaches lim.
 * Being a do/while, it copies one element even if src >= lim on entry.
 * src and dst must be modifiable lvalues.  There is deliberately no
 * trailing semicolon: the caller supplies it. */
#define FROMTOUPTO(src, dst, lim) \
    do { *(dst)++ = *(src)++; } while ((src) < (lim))
87
88
/* A run is delimited by a link kept in the auxiliary list: the slot at
** the run's first position holds a pointer to the first position of the
** run that follows it.  NEXT(P) names that link and expands to an
** lvalue, so it can be assigned as well as read.
*/

#define NEXT(P)		(GPPP(P)[0])
96
97
/* PTHRESH: how many adjacent pairs must compare with the same sense
** before it is worth checking whether they form a longer run and
** extending it.  The unit is PAIRS, not elements, so a value of 8
** corresponds to a run of 16 elements.
*/

#define PTHRESH 8
104
/* RTHRESH: how many consecutive elements of one run must compare low
** against the low element of the opposing run before the merge switches
** from single stepping to a binary ramp-up.
** With random input, N lows in a row should occur only with probability
** 2^(1-N), so betting on orderly input costs little when the bet is wrong.
*/

#define RTHRESH 6
114
115
116 /*
117 ** Overview of algorithm and variables.
118 ** The array of elements at list1 will be organized into runs of length 2,
119 ** or runs of length >= 2 * PTHRESH. We only try to form long runs when
120 ** PTHRESH adjacent pairs compare in the same way, suggesting overall order.
121 **
122 ** Unless otherwise specified, pair pointers address the first of two elements.
123 **
124 ** b and b+1 are a pair that compare with sense ``sense''.
125 ** b is the ``bottom'' of adjacent pairs that might form a longer run.
126 **
127 ** p2 parallels b in the list2 array, where runs are defined by
128 ** a pointer chain.
129 **
130 ** t represents the ``top'' of the adjacent pairs that might extend
131 ** the run beginning at b. Usually, t addresses a pair
132 ** that compares with opposite sense from (b,b+1).
133 ** However, it may also address a singleton element at the end of list1,
134 ** or it may be equal to ``last'', the first element beyond list1.
135 **
136 ** r addresses the Nth pair following b. If this would be beyond t,
137 ** we back it off to t. Only when r is less than t do we consider the
138 ** run long enough to consider checking.
139 **
140 ** q addresses a pair such that the pairs at b through q already form a run.
141 ** Often, q will equal b, indicating we only are sure of the pair itself.
142 ** However, a search on the previous cycle may have revealed a longer run,
143 ** so q may be greater than b.
144 **
145 ** p is used to work back from a candidate r, trying to reach q,
146 ** which would mean b through r would be a run. If we discover such a run,
147 ** we start q at r and try to push it further towards t.
148 ** If b through r is NOT a run, we detect the wrong order at (p-1,p).
149 ** In any event, after the check (if any), we have two main cases.
150 **
151 ** 1) Short run. b <= q < p <= r <= t.
152 ** b through q is a run (perhaps trivial)
153 ** q through p are uninteresting pairs
154 ** p through r is a run
155 **
156 ** 2) Long run. b < r <= q < t.
157 ** b through q is a run (of length >= 2 * PTHRESH)
158 **
159 ** Note that degenerate cases are not only possible, but likely.
160 ** For example, if the pair following b compares with opposite sense,
161 ** then b == q < p == r == t.
162 */
163
164
165 static IV
dynprep(pTHX_ SV ** list1,SV ** list2,size_t nmemb,SVCOMPARE_t cmp)166 dynprep(pTHX_ SV **list1, SV **list2, size_t nmemb, SVCOMPARE_t cmp)
167 {
168 I32 sense;
169 register SV **b, **p, **q, **t, **p2;
170 register SV *c, **last, **r;
171 SV **savep;
172 IV runs = 0;
173
174 b = list1;
175 last = PINDEX(b, nmemb);
176 sense = (cmp(aTHX_ *b, *(b+1)) > 0);
177 for (p2 = list2; b < last; ) {
178 /* We just started, or just reversed sense.
179 ** Set t at end of pairs with the prevailing sense.
180 */
181 for (p = b+2, t = p; ++p < last; t = ++p) {
182 if ((cmp(aTHX_ *t, *p) > 0) != sense) break;
183 }
184 q = b;
185 /* Having laid out the playing field, look for long runs */
186 do {
187 p = r = b + (2 * PTHRESH);
188 if (r >= t) p = r = t; /* too short to care about */
189 else {
190 while (((cmp(aTHX_ *(p-1), *p) > 0) == sense) &&
191 ((p -= 2) > q));
192 if (p <= q) {
193 /* b through r is a (long) run.
194 ** Extend it as far as possible.
195 */
196 p = q = r;
197 while (((p += 2) < t) &&
198 ((cmp(aTHX_ *(p-1), *p) > 0) == sense)) q = p;
199 r = p = q + 2; /* no simple pairs, no after-run */
200 }
201 }
202 if (q > b) { /* run of greater than 2 at b */
203 savep = p;
204 p = q += 2;
205 /* pick up singleton, if possible */
206 if ((p == t) &&
207 ((t + 1) == last) &&
208 ((cmp(aTHX_ *(p-1), *p) > 0) == sense))
209 savep = r = p = q = last;
210 p2 = NEXT(p2) = p2 + (p - b); ++runs;
211 if (sense) while (b < --p) {
212 c = *b;
213 *b++ = *p;
214 *p = c;
215 }
216 p = savep;
217 }
218 while (q < p) { /* simple pairs */
219 p2 = NEXT(p2) = p2 + 2; ++runs;
220 if (sense) {
221 c = *q++;
222 *(q-1) = *q;
223 *q++ = c;
224 } else q += 2;
225 }
226 if (((b = p) == t) && ((t+1) == last)) {
227 NEXT(p2) = p2 + 1; ++runs;
228 b++;
229 }
230 q = r;
231 } while (b < t);
232 sense = !sense;
233 }
234 return runs;
235 }
236
237
238 /* The original merge sort, in use since 5.7, was as fast as, or faster than,
239 * qsort on many platforms, but slower than qsort, conspicuously so,
240 * on others. The most likely explanation was platform-specific
241 * differences in cache sizes and relative speeds.
242 *
243 * The quicksort divide-and-conquer algorithm guarantees that, as the
244 * problem is subdivided into smaller and smaller parts, the parts
245 * fit into smaller (and faster) caches. So it doesn't matter how
246 * many levels of cache exist, quicksort will "find" them, and,
 * as long as smaller is faster, take advantage of them.
248 *
249 * By contrast, consider how the original mergesort algorithm worked.
250 * Suppose we have five runs (each typically of length 2 after dynprep).
251 *
 *      pass               base                        aux
 *      0              1 2 3 4 5
 *      1                                           12 34 5
 *      2                1234 5
 *      3                                            12345
 *      4                 12345
 *
259 * Adjacent pairs are merged in "grand sweeps" through the input.
260 * This means, on pass 1, the records in runs 1 and 2 aren't revisited until
 * runs 3 and 4 are merged and the records from run 5 have been copied.
262 * The only cache that matters is one large enough to hold *all* the input.
263 * On some platforms, this may be many times slower than smaller caches.
264 *
265 * The following pseudo-code uses the same basic merge algorithm,
266 * but in a divide-and-conquer way.
267 *
268 * # merge $runs runs at offset $offset of list $list1 into $list2.
269 * # all unmerged runs ($runs == 1) originate in list $base.
 * sub mgsort2 {
 *     my ($offset, $runs, $base, $list1, $list2) = @_;
 *
 *     if ($runs == 1) {
 *         if ($list1 is $base) copy run to $list2
 *         return offset of end of list (or copy)
 *     } else {
 *         $off2 = mgsort2($offset, $runs-($runs/2), $base, $list2, $list1)
 *         mgsort2($off2, $runs/2, $base, $list2, $list1)
 *         merge the adjacent runs at $offset of $list1 into $list2
 *         return the offset of the end of the merged runs
 *     }
 * }
 * mgsort2(0, $runs, $base, $aux, $base);
284 *
285 * For our 5 runs, the tree of calls looks like
286 *
 *                 5
 *            3         2
 *         2     1   1     1
 *      1     1
 *
 *      1     2  3   4     5
293 *
294 * and the corresponding activity looks like
295 *
296 * copy runs 1 and 2 from base to aux
297 * merge runs 1 and 2 from aux to base
298 * (run 3 is where it belongs, no copy needed)
299 * merge runs 12 and 3 from base to aux
300 * (runs 4 and 5 are where they belong, no copy needed)
301 * merge runs 4 and 5 from base to aux
302 * merge runs 123 and 45 from aux to base
303 *
304 * Note that we merge runs 1 and 2 immediately after copying them,
305 * while they are still likely to be in fast cache. Similarly,
306 * run 3 is merged with run 12 while it still may be lingering in cache.
307 * This implementation should therefore enjoy much of the cache-friendly
308 * behavior that quicksort does. In addition, it does less copying
309 * than the original mergesort implementation (only runs 1 and 2 are copied)
310 * and the "balancing" of merges is better (merged runs comprise more nearly
311 * equal numbers of original runs).
312 *
313 * The actual cache-friendly implementation will use a pseudo-stack
314 * to avoid recursion, and will unroll processing of runs of length 2,
315 * but it is otherwise similar to the recursive implementation.
316 */
317
318 typedef struct {
319 IV offset; /* offset of 1st of 2 runs at this level */
320 IV runs; /* how many runs must be combined into 1 */
321 } off_runs; /* pseudo-stack element */
322
323 static void
sortsv(pTHX_ SV ** base,size_t nmemb,SVCOMPARE_t cmp)324 sortsv(pTHX_ SV **base, size_t nmemb, SVCOMPARE_t cmp)
325 {
326 IV i, run, runs, offset;
327 I32 sense, level;
328 int iwhich;
329 register SV **f1, **f2, **t, **b, **p, **tp2, **l1, **l2, **q;
330 SV **aux, **list1, **list2;
331 SV **p1;
332 SV * small[SMALLSORT];
333 SV **which[3];
334 off_runs stack[60], *stackp;
335 SVCOMPARE_t savecmp = 0;
336
337 if (nmemb <= 1) return; /* sorted trivially */
338
339 if (nmemb <= SMALLSORT) aux = small; /* use stack for aux array */
340 else { New(799,aux,nmemb,SV *); } /* allocate auxilliary array */
341 level = 0;
342 stackp = stack;
343 stackp->runs = dynprep(aTHX_ base, aux, nmemb, cmp);
344 stackp->offset = offset = 0;
345 which[0] = which[2] = base;
346 which[1] = aux;
347 for (;;) {
348 /* On levels where both runs have be constructed (stackp->runs == 0),
349 * merge them, and note the offset of their end, in case the offset
350 * is needed at the next level up. Hop up a level, and,
351 * as long as stackp->runs is 0, keep merging.
352 */
353 if ((runs = stackp->runs) == 0) {
354 iwhich = level & 1;
355 list1 = which[iwhich]; /* area where runs are now */
356 list2 = which[++iwhich]; /* area for merged runs */
357 do {
358 offset = stackp->offset;
359 f1 = p1 = list1 + offset; /* start of first run */
360 p = tp2 = list2 + offset; /* where merged run will go */
361 t = NEXT(p); /* where first run ends */
362 f2 = l1 = POTHER(t, list2, list1); /* ... on the other side */
363 t = NEXT(t); /* where second runs ends */
364 l2 = POTHER(t, list2, list1); /* ... on the other side */
365 offset = PNELEM(list2, t);
366 while (f1 < l1 && f2 < l2) {
367 /* If head 1 is larger than head 2, find ALL the elements
368 ** in list 2 strictly less than head1, write them all,
369 ** then head 1. Then compare the new heads, and repeat,
370 ** until one or both lists are exhausted.
371 **
372 ** In all comparisons (after establishing
373 ** which head to merge) the item to merge
374 ** (at pointer q) is the first operand of
375 ** the comparison. When we want to know
376 ** if ``q is strictly less than the other'',
377 ** we can't just do
378 ** cmp(q, other) < 0
379 ** because stability demands that we treat equality
380 ** as high when q comes from l2, and as low when
381 ** q was from l1. So we ask the question by doing
382 ** cmp(q, other) <= sense
383 ** and make sense == 0 when equality should look low,
384 ** and -1 when equality should look high.
385 */
386
387
388 if (cmp(aTHX_ *f1, *f2) <= 0) {
389 q = f2; b = f1; t = l1;
390 sense = -1;
391 } else {
392 q = f1; b = f2; t = l2;
393 sense = 0;
394 }
395
396
397 /* ramp up
398 **
399 ** Leave t at something strictly
400 ** greater than q (or at the end of the list),
401 ** and b at something strictly less than q.
402 */
403 for (i = 1, run = 0 ;;) {
404 if ((p = PINDEX(b, i)) >= t) {
405 /* off the end */
406 if (((p = PINDEX(t, -1)) > b) &&
407 (cmp(aTHX_ *q, *p) <= sense))
408 t = p;
409 else b = p;
410 break;
411 } else if (cmp(aTHX_ *q, *p) <= sense) {
412 t = p;
413 break;
414 } else b = p;
415 if (++run >= RTHRESH) i += i;
416 }
417
418
419 /* q is known to follow b and must be inserted before t.
420 ** Increment b, so the range of possibilities is [b,t).
421 ** Round binary split down, to favor early appearance.
422 ** Adjust b and t until q belongs just before t.
423 */
424
425 b++;
426 while (b < t) {
427 p = PINDEX(b, (PNELEM(b, t) - 1) / 2);
428 if (cmp(aTHX_ *q, *p) <= sense) {
429 t = p;
430 } else b = p + 1;
431 }
432
433
434 /* Copy all the strictly low elements */
435
436 if (q == f1) {
437 FROMTOUPTO(f2, tp2, t);
438 *tp2++ = *f1++;
439 } else {
440 FROMTOUPTO(f1, tp2, t);
441 *tp2++ = *f2++;
442 }
443 }
444
445
446 /* Run out remaining list */
447 if (f1 == l1) {
448 if (f2 < l2) FROMTOUPTO(f2, tp2, l2);
449 } else FROMTOUPTO(f1, tp2, l1);
450 p1 = NEXT(p1) = POTHER(tp2, list2, list1);
451
452 if (--level == 0) goto done;
453 --stackp;
454 t = list1; list1 = list2; list2 = t; /* swap lists */
455 } while ((runs = stackp->runs) == 0);
456 }
457
458
459 stackp->runs = 0; /* current run will finish level */
460 /* While there are more than 2 runs remaining,
461 * turn them into exactly 2 runs (at the "other" level),
462 * each made up of approximately half the runs.
463 * Stack the second half for later processing,
464 * and set about producing the first half now.
465 */
466 while (runs > 2) {
467 ++level;
468 ++stackp;
469 stackp->offset = offset;
470 runs -= stackp->runs = runs / 2;
471 }
472 /* We must construct a single run from 1 or 2 runs.
473 * All the original runs are in which[0] == base.
474 * The run we construct must end up in which[level&1].
475 */
476 iwhich = level & 1;
477 if (runs == 1) {
478 /* Constructing a single run from a single run.
479 * If it's where it belongs already, there's nothing to do.
480 * Otherwise, copy it to where it belongs.
481 * A run of 1 is either a singleton at level 0,
482 * or the second half of a split 3. In neither event
483 * is it necessary to set offset. It will be set by the merge
484 * that immediately follows.
485 */
486 if (iwhich) { /* Belongs in aux, currently in base */
487 f1 = b = PINDEX(base, offset); /* where list starts */
488 f2 = PINDEX(aux, offset); /* where list goes */
489 t = NEXT(f2); /* where list will end */
490 offset = PNELEM(aux, t); /* offset thereof */
491 t = PINDEX(base, offset); /* where it currently ends */
492 FROMTOUPTO(f1, f2, t); /* copy */
493 NEXT(b) = t; /* set up parallel pointer */
494 } else if (level == 0) goto done; /* single run at level 0 */
495 } else {
496 /* Constructing a single run from two runs.
497 * The merge code at the top will do that.
498 * We need only make sure the two runs are in the "other" array,
499 * so they'll end up in the correct array after the merge.
500 */
501 ++level;
502 ++stackp;
503 stackp->offset = offset;
504 stackp->runs = 0; /* take care of both runs, trigger merge */
505 if (!iwhich) { /* Merged runs belong in aux, copy 1st */
506 f1 = b = PINDEX(base, offset); /* where first run starts */
507 f2 = PINDEX(aux, offset); /* where it will be copied */
508 t = NEXT(f2); /* where first run will end */
509 offset = PNELEM(aux, t); /* offset thereof */
510 p = PINDEX(base, offset); /* end of first run */
511 t = NEXT(t); /* where second run will end */
512 t = PINDEX(base, PNELEM(aux, t)); /* where it now ends */
513 FROMTOUPTO(f1, f2, t); /* copy both runs */
514 NEXT(b) = p; /* paralled pointer for 1st */
515 NEXT(p) = t; /* ... and for second */
516 }
517 }
518 }
519 done:
520 if (aux != small) Safefree(aux); /* free iff allocated */
521 return;
522 }
523