1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /* ------------------------------------------------------------------
8    This file is part of bzip2/libbzip2, a program and library for
9    lossless, block-sorting data compression.
10 
11    bzip2/libbzip2 version 1.0.5 of 10 December 2007
12    Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org>
13 
14    Please read the WARNING, DISCLAIMER and PATENTS sections in the
15    README file.
16 
17    This program is released under the terms of the license contained
18    in the file LICENSE.
19    ------------------------------------------------------------------ */
20 
21 
22 #include "bzlib_private.h"
23 
24 ABC_NAMESPACE_IMPL_START
25 
26 /*---------------------------------------------*/
27 /*--- Fallback O(N log(N)^2) sorting        ---*/
28 /*--- algorithm, for repetitive blocks      ---*/
29 /*---------------------------------------------*/
30 
31 /*---------------------------------------------*/
32 static
33 __inline__
fallbackSimpleSort(UInt32 * fmap,UInt32 * eclass,Int32 lo,Int32 hi)34 void fallbackSimpleSort ( UInt32* fmap,
35                           UInt32* eclass,
36                           Int32   lo,
37                           Int32   hi )
38 {
39    Int32 i, j, tmp;
40    UInt32 ec_tmp;
41 
42    if (lo == hi) return;
43 
44    if (hi - lo > 3) {
45       for ( i = hi-4; i >= lo; i-- ) {
46          tmp = fmap[i];
47          ec_tmp = eclass[tmp];
48          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
49             fmap[j-4] = fmap[j];
50          fmap[j-4] = tmp;
51       }
52    }
53 
54    for ( i = hi-1; i >= lo; i-- ) {
55       tmp = fmap[i];
56       ec_tmp = eclass[tmp];
57       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
58          fmap[j-1] = fmap[j];
59       fmap[j-1] = tmp;
60    }
61 }
62 
63 
64 /*---------------------------------------------*/
65 #define fswap(zz1, zz2) \
66    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
67 
68 #define fvswap(zzp1, zzp2, zzn)       \
69 {                                     \
70    Int32 yyp1 = (zzp1);               \
71    Int32 yyp2 = (zzp2);               \
72    Int32 yyn  = (zzn);                \
73    while (yyn > 0) {                  \
74       fswap(fmap[yyp1], fmap[yyp2]);  \
75       yyp1++; yyp2++; yyn--;          \
76    }                                  \
77 }
78 
79 
80 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
81 
82 #define fpush(lz,hz) { stackLo[sp] = lz; \
83                        stackHi[sp] = hz; \
84                        sp++; }
85 
86 #define fpop(lz,hz) { sp--;              \
87                       lz = stackLo[sp];  \
88                       hz = stackHi[sp]; }
89 
90 #define FALLBACK_QSORT_SMALL_THRESH 10
91 #define FALLBACK_QSORT_STACK_SIZE   100
92 
93 
94 static
fallbackQSort3(UInt32 * fmap,UInt32 * eclass,Int32 loSt,Int32 hiSt)95 void fallbackQSort3 ( UInt32* fmap,
96                       UInt32* eclass,
97                       Int32   loSt,
98                       Int32   hiSt )
99 {
100    Int32 unLo, unHi, ltLo, gtHi, n, m;
101    Int32 sp, lo, hi;
102    UInt32 med, r, r3;
103    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
104    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
105 
106    r = 0;
107 
108    sp = 0;
109    fpush ( loSt, hiSt );
110 
111    while (sp > 0) {
112 
113       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
114 
115       fpop ( lo, hi );
116       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
117          fallbackSimpleSort ( fmap, eclass, lo, hi );
118          continue;
119       }
120 
121       /* Random partitioning.  Median of 3 sometimes fails to
122          avoid bad cases.  Median of 9 seems to help but
123          looks rather expensive.  This too seems to work but
124          is cheaper.  Guidance for the magic constants
125          7621 and 32768 is taken from Sedgewick's algorithms
126          book, chapter 35.
127       */
128       r = ((r * 7621) + 1) % 32768;
129       r3 = r % 3;
130       if (r3 == 0) med = eclass[fmap[lo]]; else
131       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
132                    med = eclass[fmap[hi]];
133 
134       unLo = ltLo = lo;
135       unHi = gtHi = hi;
136 
137       while (1) {
138          while (1) {
139             if (unLo > unHi) break;
140             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
141             if (n == 0) {
142                fswap(fmap[unLo], fmap[ltLo]);
143                ltLo++; unLo++;
144                continue;
145             };
146             if (n > 0) break;
147             unLo++;
148          }
149          while (1) {
150             if (unLo > unHi) break;
151             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
152             if (n == 0) {
153                fswap(fmap[unHi], fmap[gtHi]);
154                gtHi--; unHi--;
155                continue;
156             };
157             if (n < 0) break;
158             unHi--;
159          }
160          if (unLo > unHi) break;
161          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
162       }
163 
164       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
165 
166       if (gtHi < ltLo) continue;
167 
168       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
169       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
170 
171       n = lo + unLo - ltLo - 1;
172       m = hi - (gtHi - unHi) + 1;
173 
174       if (n - lo > hi - m) {
175          fpush ( lo, n );
176          fpush ( m, hi );
177       } else {
178          fpush ( m, hi );
179          fpush ( lo, n );
180       }
181    }
182 }
183 
184 #undef fmin
185 #undef fpush
186 #undef fpop
187 #undef fswap
188 #undef fvswap
189 #undef FALLBACK_QSORT_SMALL_THRESH
190 #undef FALLBACK_QSORT_STACK_SIZE
191 
192 
193 /*---------------------------------------------*/
194 /* Pre:
195       nblock > 0
196       eclass exists for [0 .. nblock-1]
197       ((UChar*)eclass) [0 .. nblock-1] holds block
198       ptr exists for [0 .. nblock-1]
199 
200    Post:
201       ((UChar*)eclass) [0 .. nblock-1] holds block
202       All other areas of eclass destroyed
203       fmap [0 .. nblock-1] holds sorted order
204       bhtab [ 0 .. 2+(nblock/32) ] destroyed
205 */
206 
207 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
208 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
209 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
210 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
211 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
212 
213 static
fallbackSort(UInt32 * fmap,UInt32 * eclass,UInt32 * bhtab,Int32 nblock,Int32 verb)214 void fallbackSort ( UInt32* fmap,
215                     UInt32* eclass,
216                     UInt32* bhtab,
217                     Int32   nblock,
218                     Int32   verb )
219 {
220    Int32 ftab[257];
221    Int32 ftabCopy[256];
222    Int32 H, i, j, k, l, r, cc, cc1;
223    Int32 nNotDone;
224    Int32 nBhtab;
225    UChar* eclass8 = (UChar*)eclass;
226 
227    /*--
228       Initial 1-char radix sort to generate
229       initial fmap and initial BH bits.
230    --*/
231    if (verb >= 4)
232       VPrintf0 ( "        bucket sorting ...\n" );
233    for (i = 0; i < 257;    i++) ftab[i] = 0;
234    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
235    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
236    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
237 
238    for (i = 0; i < nblock; i++) {
239       j = eclass8[i];
240       k = ftab[j] - 1;
241       ftab[j] = k;
242       fmap[k] = i;
243    }
244 
245    nBhtab = 2 + (nblock / 32);
246    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
247    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
248 
249    /*--
250       Inductively refine the buckets.  Kind-of an
251       "exponential radix sort" (!), inspired by the
252       Manber-Myers suffix array construction algorithm.
253    --*/
254 
255    /*-- set sentinel bits for block-end detection --*/
256    for (i = 0; i < 32; i++) {
257       SET_BH(nblock + 2*i);
258       CLEAR_BH(nblock + 2*i + 1);
259    }
260 
261    /*-- the log(N) loop --*/
262    H = 1;
263    while (1) {
264 
265       if (verb >= 4)
266          VPrintf1 ( "        depth %6d has ", H );
267 
268       j = 0;
269       for (i = 0; i < nblock; i++) {
270          if (ISSET_BH(i)) j = i;
271          k = fmap[i] - H; if (k < 0) k += nblock;
272          eclass[k] = j;
273       }
274 
275       nNotDone = 0;
276       r = -1;
277       while (1) {
278 
279      /*-- find the next non-singleton bucket --*/
280          k = r + 1;
281          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
282          if (ISSET_BH(k)) {
283             while (WORD_BH(k) == 0xffffffff) k += 32;
284             while (ISSET_BH(k)) k++;
285          }
286          l = k - 1;
287          if (l >= nblock) break;
288          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
289          if (!ISSET_BH(k)) {
290             while (WORD_BH(k) == 0x00000000) k += 32;
291             while (!ISSET_BH(k)) k++;
292          }
293          r = k - 1;
294          if (r >= nblock) break;
295 
296          /*-- now [l, r] bracket current bucket --*/
297          if (r > l) {
298             nNotDone += (r - l + 1);
299             fallbackQSort3 ( fmap, eclass, l, r );
300 
301             /*-- scan bucket and generate header bits-- */
302             cc = -1;
303             for (i = l; i <= r; i++) {
304                cc1 = eclass[fmap[i]];
305                if (cc != cc1) { SET_BH(i); cc = cc1; };
306             }
307          }
308       }
309 
310       if (verb >= 4)
311          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
312 
313       H *= 2;
314       if (H > nblock || nNotDone == 0) break;
315    }
316 
317    /*--
318       Reconstruct the original block in
319       eclass8 [0 .. nblock-1], since the
320       previous phase destroyed it.
321    --*/
322    if (verb >= 4)
323       VPrintf0 ( "        reconstructing block ...\n" );
324    j = 0;
325    for (i = 0; i < nblock; i++) {
326       while (ftabCopy[j] == 0) j++;
327       ftabCopy[j]--;
328       eclass8[fmap[i]] = (UChar)j;
329    }
330    AssertH ( j < 256, 1005 );
331 }
332 
333 #undef       SET_BH
334 #undef     CLEAR_BH
335 #undef     ISSET_BH
336 #undef      WORD_BH
337 #undef UNALIGNED_BH
338 
339 
340 /*---------------------------------------------*/
341 /*--- The main, O(N^2 log(N)) sorting       ---*/
342 /*--- algorithm.  Faster for "normal"       ---*/
343 /*--- non-repetitive blocks.                ---*/
344 /*---------------------------------------------*/
345 
346 /*---------------------------------------------*/
347 static
348 __inline__
mainGtU(UInt32 i1,UInt32 i2,UChar * block,UInt16 * quadrant,UInt32 nblock,Int32 * budget)349 Bool mainGtU ( UInt32  i1,
350                UInt32  i2,
351                UChar*  block,
352                UInt16* quadrant,
353                UInt32  nblock,
354                Int32*  budget )
355 {
356    Int32  k;
357    UChar  c1, c2;
358    UInt16 s1, s2;
359 
360    AssertD ( i1 != i2, "mainGtU" );
361    /* 1 */
362    c1 = block[i1]; c2 = block[i2];
363    if (c1 != c2) return (c1 > c2);
364    i1++; i2++;
365    /* 2 */
366    c1 = block[i1]; c2 = block[i2];
367    if (c1 != c2) return (c1 > c2);
368    i1++; i2++;
369    /* 3 */
370    c1 = block[i1]; c2 = block[i2];
371    if (c1 != c2) return (c1 > c2);
372    i1++; i2++;
373    /* 4 */
374    c1 = block[i1]; c2 = block[i2];
375    if (c1 != c2) return (c1 > c2);
376    i1++; i2++;
377    /* 5 */
378    c1 = block[i1]; c2 = block[i2];
379    if (c1 != c2) return (c1 > c2);
380    i1++; i2++;
381    /* 6 */
382    c1 = block[i1]; c2 = block[i2];
383    if (c1 != c2) return (c1 > c2);
384    i1++; i2++;
385    /* 7 */
386    c1 = block[i1]; c2 = block[i2];
387    if (c1 != c2) return (c1 > c2);
388    i1++; i2++;
389    /* 8 */
390    c1 = block[i1]; c2 = block[i2];
391    if (c1 != c2) return (c1 > c2);
392    i1++; i2++;
393    /* 9 */
394    c1 = block[i1]; c2 = block[i2];
395    if (c1 != c2) return (c1 > c2);
396    i1++; i2++;
397    /* 10 */
398    c1 = block[i1]; c2 = block[i2];
399    if (c1 != c2) return (c1 > c2);
400    i1++; i2++;
401    /* 11 */
402    c1 = block[i1]; c2 = block[i2];
403    if (c1 != c2) return (c1 > c2);
404    i1++; i2++;
405    /* 12 */
406    c1 = block[i1]; c2 = block[i2];
407    if (c1 != c2) return (c1 > c2);
408    i1++; i2++;
409 
410    k = nblock + 8;
411 
412    do {
413       /* 1 */
414       c1 = block[i1]; c2 = block[i2];
415       if (c1 != c2) return (c1 > c2);
416       s1 = quadrant[i1]; s2 = quadrant[i2];
417       if (s1 != s2) return (s1 > s2);
418       i1++; i2++;
419       /* 2 */
420       c1 = block[i1]; c2 = block[i2];
421       if (c1 != c2) return (c1 > c2);
422       s1 = quadrant[i1]; s2 = quadrant[i2];
423       if (s1 != s2) return (s1 > s2);
424       i1++; i2++;
425       /* 3 */
426       c1 = block[i1]; c2 = block[i2];
427       if (c1 != c2) return (c1 > c2);
428       s1 = quadrant[i1]; s2 = quadrant[i2];
429       if (s1 != s2) return (s1 > s2);
430       i1++; i2++;
431       /* 4 */
432       c1 = block[i1]; c2 = block[i2];
433       if (c1 != c2) return (c1 > c2);
434       s1 = quadrant[i1]; s2 = quadrant[i2];
435       if (s1 != s2) return (s1 > s2);
436       i1++; i2++;
437       /* 5 */
438       c1 = block[i1]; c2 = block[i2];
439       if (c1 != c2) return (c1 > c2);
440       s1 = quadrant[i1]; s2 = quadrant[i2];
441       if (s1 != s2) return (s1 > s2);
442       i1++; i2++;
443       /* 6 */
444       c1 = block[i1]; c2 = block[i2];
445       if (c1 != c2) return (c1 > c2);
446       s1 = quadrant[i1]; s2 = quadrant[i2];
447       if (s1 != s2) return (s1 > s2);
448       i1++; i2++;
449       /* 7 */
450       c1 = block[i1]; c2 = block[i2];
451       if (c1 != c2) return (c1 > c2);
452       s1 = quadrant[i1]; s2 = quadrant[i2];
453       if (s1 != s2) return (s1 > s2);
454       i1++; i2++;
455       /* 8 */
456       c1 = block[i1]; c2 = block[i2];
457       if (c1 != c2) return (c1 > c2);
458       s1 = quadrant[i1]; s2 = quadrant[i2];
459       if (s1 != s2) return (s1 > s2);
460       i1++; i2++;
461 
462       if (i1 >= nblock) i1 -= nblock;
463       if (i2 >= nblock) i2 -= nblock;
464 
465       k -= 8;
466       (*budget)--;
467    }
468       while (k >= 0);
469 
470    return False;
471 }
472 
473 
474 /*---------------------------------------------*/
475 /*--
476    Knuth's increments seem to work better
477    than Incerpi-Sedgewick here.  Possibly
478    because the number of elems to sort is
479    usually small, typically <= 20.
480 --*/
481 static
482 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
483                    9841, 29524, 88573, 265720,
484                    797161, 2391484 };
485 
486 static
mainSimpleSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 lo,Int32 hi,Int32 d,Int32 * budget)487 void mainSimpleSort ( UInt32* ptr,
488                       UChar*  block,
489                       UInt16* quadrant,
490                       Int32   nblock,
491                       Int32   lo,
492                       Int32   hi,
493                       Int32   d,
494                       Int32*  budget )
495 {
496    Int32 i, j, h, bigN, hp;
497    UInt32 v;
498 
499    bigN = hi - lo + 1;
500    if (bigN < 2) return;
501 
502    hp = 0;
503    while (incs[hp] < bigN) hp++;
504    hp--;
505 
506    for (; hp >= 0; hp--) {
507       h = incs[hp];
508 
509       i = lo + h;
510       while (True) {
511 
512          /*-- copy 1 --*/
513          if (i > hi) break;
514          v = ptr[i];
515          j = i;
516          while ( mainGtU (
517                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
518                  ) ) {
519             ptr[j] = ptr[j-h];
520             j = j - h;
521             if (j <= (lo + h - 1)) break;
522          }
523          ptr[j] = v;
524          i++;
525 
526          /*-- copy 2 --*/
527          if (i > hi) break;
528          v = ptr[i];
529          j = i;
530          while ( mainGtU (
531                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
532                  ) ) {
533             ptr[j] = ptr[j-h];
534             j = j - h;
535             if (j <= (lo + h - 1)) break;
536          }
537          ptr[j] = v;
538          i++;
539 
540          /*-- copy 3 --*/
541          if (i > hi) break;
542          v = ptr[i];
543          j = i;
544          while ( mainGtU (
545                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
546                  ) ) {
547             ptr[j] = ptr[j-h];
548             j = j - h;
549             if (j <= (lo + h - 1)) break;
550          }
551          ptr[j] = v;
552          i++;
553 
554          if (*budget < 0) return;
555       }
556    }
557 }
558 
559 
560 /*---------------------------------------------*/
561 /*--
562    The following is an implementation of
563    an elegant 3-way quicksort for strings,
564    described in a paper "Fast Algorithms for
565    Sorting and Searching Strings", by Robert
566    Sedgewick and Jon L. Bentley.
567 --*/
568 
569 #define mswap(zz1, zz2) \
570    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
571 
572 #define mvswap(zzp1, zzp2, zzn)       \
573 {                                     \
574    Int32 yyp1 = (zzp1);               \
575    Int32 yyp2 = (zzp2);               \
576    Int32 yyn  = (zzn);                \
577    while (yyn > 0) {                  \
578       mswap(ptr[yyp1], ptr[yyp2]);    \
579       yyp1++; yyp2++; yyn--;          \
580    }                                  \
581 }
582 
583 static
584 __inline__
mmed3(UChar a,UChar b,UChar c)585 UChar mmed3 ( UChar a, UChar b, UChar c )
586 {
587    UChar t;
588    if (a > b) { t = a; a = b; b = t; };
589    if (b > c) {
590       b = c;
591       if (a > b) b = a;
592    }
593    return b;
594 }
595 
596 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
597 
598 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
599                           stackHi[sp] = hz; \
600                           stackD [sp] = dz; \
601                           sp++; }
602 
603 #define mpop(lz,hz,dz) { sp--;             \
604                          lz = stackLo[sp]; \
605                          hz = stackHi[sp]; \
606                          dz = stackD [sp]; }
607 
608 
609 #define mnextsize(az) (nextHi[az]-nextLo[az])
610 
611 #define mnextswap(az,bz)                                        \
612    { Int32 tz;                                                  \
613      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
614      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
615      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
616 
617 
618 #define MAIN_QSORT_SMALL_THRESH 20
619 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
620 #define MAIN_QSORT_STACK_SIZE 100
621 
622 static
mainQSort3(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 loSt,Int32 hiSt,Int32 dSt,Int32 * budget)623 void mainQSort3 ( UInt32* ptr,
624                   UChar*  block,
625                   UInt16* quadrant,
626                   Int32   nblock,
627                   Int32   loSt,
628                   Int32   hiSt,
629                   Int32   dSt,
630                   Int32*  budget )
631 {
632    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
633    Int32 sp, lo, hi, d;
634 
635    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
636    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
637    Int32 stackD [MAIN_QSORT_STACK_SIZE];
638 
639    Int32 nextLo[3];
640    Int32 nextHi[3];
641    Int32 nextD [3];
642 
643    sp = 0;
644    mpush ( loSt, hiSt, dSt );
645 
646    while (sp > 0) {
647 
648       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
649 
650       mpop ( lo, hi, d );
651       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
652           d > MAIN_QSORT_DEPTH_THRESH) {
653          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
654          if (*budget < 0) return;
655          continue;
656       }
657 
658       med = (Int32)
659             mmed3 ( block[ptr[ lo         ]+d],
660                     block[ptr[ hi         ]+d],
661                     block[ptr[ (lo+hi)>>1 ]+d] );
662 
663       unLo = ltLo = lo;
664       unHi = gtHi = hi;
665 
666       while (True) {
667          while (True) {
668             if (unLo > unHi) break;
669             n = ((Int32)block[ptr[unLo]+d]) - med;
670             if (n == 0) {
671                mswap(ptr[unLo], ptr[ltLo]);
672                ltLo++; unLo++; continue;
673             };
674             if (n >  0) break;
675             unLo++;
676          }
677          while (True) {
678             if (unLo > unHi) break;
679             n = ((Int32)block[ptr[unHi]+d]) - med;
680             if (n == 0) {
681                mswap(ptr[unHi], ptr[gtHi]);
682                gtHi--; unHi--; continue;
683             };
684             if (n <  0) break;
685             unHi--;
686          }
687          if (unLo > unHi) break;
688          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
689       }
690 
691       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
692 
693       if (gtHi < ltLo) {
694          mpush(lo, hi, d+1 );
695          continue;
696       }
697 
698       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
699       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
700 
701       n = lo + unLo - ltLo - 1;
702       m = hi - (gtHi - unHi) + 1;
703 
704       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
705       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
706       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
707 
708       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
710       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
711 
712       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
713       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
714 
715       mpush (nextLo[0], nextHi[0], nextD[0]);
716       mpush (nextLo[1], nextHi[1], nextD[1]);
717       mpush (nextLo[2], nextHi[2], nextD[2]);
718    }
719 }
720 
721 #undef mswap
722 #undef mvswap
723 #undef mpush
724 #undef mpop
725 #undef mmin
726 #undef mnextsize
727 #undef mnextswap
728 #undef MAIN_QSORT_SMALL_THRESH
729 #undef MAIN_QSORT_DEPTH_THRESH
730 #undef MAIN_QSORT_STACK_SIZE
731 
732 
733 /*---------------------------------------------*/
734 /* Pre:
735       nblock > N_OVERSHOOT
736       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
737       ((UChar*)block32) [0 .. nblock-1] holds block
738       ptr exists for [0 .. nblock-1]
739 
740    Post:
741       ((UChar*)block32) [0 .. nblock-1] holds block
742       All other areas of block32 destroyed
743       ftab [0 .. 65536 ] destroyed
744       ptr [0 .. nblock-1] holds sorted order
745       if (*budget < 0), sorting was abandoned
746 */
747 
748 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
749 #define SETMASK (1 << 21)
750 #define CLEARMASK (~(SETMASK))
751 
752 static
mainSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,UInt32 * ftab,Int32 nblock,Int32 verb,Int32 * budget)753 void mainSort ( UInt32* ptr,
754                 UChar*  block,
755                 UInt16* quadrant,
756                 UInt32* ftab,
757                 Int32   nblock,
758                 Int32   verb,
759                 Int32*  budget )
760 {
761    Int32  i, j, k, ss, sb;
762    Int32  runningOrder[256];
763    Bool   bigDone[256];
764    Int32  copyStart[256];
765    Int32  copyEnd  [256];
766    UChar  c1;
767    Int32  numQSorted;
768    UInt16 s;
769    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
770 
771    /*-- set up the 2-byte frequency table --*/
772    for (i = 65536; i >= 0; i--) ftab[i] = 0;
773 
774    j = block[0] << 8;
775    i = nblock-1;
776    for (; i >= 3; i -= 4) {
777       quadrant[i] = 0;
778       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
779       ftab[j]++;
780       quadrant[i-1] = 0;
781       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
782       ftab[j]++;
783       quadrant[i-2] = 0;
784       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
785       ftab[j]++;
786       quadrant[i-3] = 0;
787       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
788       ftab[j]++;
789    }
790    for (; i >= 0; i--) {
791       quadrant[i] = 0;
792       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
793       ftab[j]++;
794    }
795 
796    /*-- (emphasises close relationship of block & quadrant) --*/
797    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
798       block   [nblock+i] = block[i];
799       quadrant[nblock+i] = 0;
800    }
801 
802    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
803 
804    /*-- Complete the initial radix sort --*/
805    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
806 
807    s = block[0] << 8;
808    i = nblock-1;
809    for (; i >= 3; i -= 4) {
810       s = (s >> 8) | (block[i] << 8);
811       j = ftab[s] -1;
812       ftab[s] = j;
813       ptr[j] = i;
814       s = (s >> 8) | (block[i-1] << 8);
815       j = ftab[s] -1;
816       ftab[s] = j;
817       ptr[j] = i-1;
818       s = (s >> 8) | (block[i-2] << 8);
819       j = ftab[s] -1;
820       ftab[s] = j;
821       ptr[j] = i-2;
822       s = (s >> 8) | (block[i-3] << 8);
823       j = ftab[s] -1;
824       ftab[s] = j;
825       ptr[j] = i-3;
826    }
827    for (; i >= 0; i--) {
828       s = (s >> 8) | (block[i] << 8);
829       j = ftab[s] -1;
830       ftab[s] = j;
831       ptr[j] = i;
832    }
833 
834    /*--
835       Now ftab contains the first loc of every small bucket.
836       Calculate the running order, from smallest to largest
837       big bucket.
838    --*/
839    for (i = 0; i <= 255; i++) {
840       bigDone     [i] = False;
841       runningOrder[i] = i;
842    }
843 
844    {
845       Int32 vv;
846       Int32 h = 1;
847       do h = 3 * h + 1; while (h <= 256);
848       do {
849          h = h / 3;
850          for (i = h; i <= 255; i++) {
851             vv = runningOrder[i];
852             j = i;
853             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
854                runningOrder[j] = runningOrder[j-h];
855                j = j - h;
856                if (j <= (h - 1)) goto zero;
857             }
858             zero:
859             runningOrder[j] = vv;
860          }
861       } while (h != 1);
862    }
863 
864    /*--
865       The main sorting loop.
866    --*/
867 
868    numQSorted = 0;
869 
870    for (i = 0; i <= 255; i++) {
871 
872       /*--
873          Process big buckets, starting with the least full.
874          Basically this is a 3-step process in which we call
875          mainQSort3 to sort the small buckets [ss, j], but
876          also make a big effort to avoid the calls if we can.
877       --*/
878       ss = runningOrder[i];
879 
880       /*--
881          Step 1:
882          Complete the big bucket [ss] by quicksorting
883          any unsorted small buckets [ss, j], for j != ss.
884          Hopefully previous pointer-scanning phases have already
885          completed many of the small buckets [ss, j], so
886          we don't have to sort them at all.
887       --*/
888       for (j = 0; j <= 255; j++) {
889          if (j != ss) {
890             sb = (ss << 8) + j;
891             if ( ! (ftab[sb] & SETMASK) ) {
892                Int32 lo = ftab[sb]   & CLEARMASK;
893                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
894                if (hi > lo) {
895                   if (verb >= 4)
896                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
897                                 "done %d   this %d\n",
898                                 ss, j, numQSorted, hi - lo + 1 );
899                   mainQSort3 (
900                      ptr, block, quadrant, nblock,
901                      lo, hi, BZ_N_RADIX, budget
902                   );
903                   numQSorted += (hi - lo + 1);
904                   if (*budget < 0) return;
905                }
906             }
907             ftab[sb] |= SETMASK;
908          }
909       }
910 
911       AssertH ( !bigDone[ss], 1006 );
912 
913       /*--
914          Step 2:
915          Now scan this big bucket [ss] so as to synthesise the
916          sorted order for small buckets [t, ss] for all t,
917          including, magically, the bucket [ss,ss] too.
918          This will avoid doing Real Work in subsequent Step 1's.
919       --*/
920       {
921          for (j = 0; j <= 255; j++) {
922             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
923             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
924          }
925          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
926             k = ptr[j]-1; if (k < 0) k += nblock;
927             c1 = block[k];
928             if (!bigDone[c1])
929                ptr[ copyStart[c1]++ ] = k;
930          }
931          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
932             k = ptr[j]-1; if (k < 0) k += nblock;
933             c1 = block[k];
934             if (!bigDone[c1])
935                ptr[ copyEnd[c1]-- ] = k;
936          }
937       }
938 
939       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
940                 ||
941                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
942                    Necessity for this case is demonstrated by compressing
943                    a sequence of approximately 48.5 million of character
944                    251; 1.0.0/1.0.1 will then die here. */
945                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
946                 1007 )
947 
948       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
949 
950       /*--
951          Step 3:
952          The [ss] big bucket is now done.  Record this fact,
953          and update the quadrant descriptors.  Remember to
954          update quadrants in the overshoot area too, if
955          necessary.  The "if (i < 255)" test merely skips
956          this updating for the last bucket processed, since
957          updating for the last bucket is pointless.
958 
959          The quadrant array provides a way to incrementally
960          cache sort orderings, as they appear, so as to
961          make subsequent comparisons in fullGtU() complete
962          faster.  For repetitive blocks this makes a big
963          difference (but not big enough to be able to avoid
964          the fallback sorting mechanism, exponential radix sort).
965 
966          The precise meaning is: at all times:
967 
968             for 0 <= i < nblock and 0 <= j <= nblock
969 
970             if block[i] != block[j],
971 
972                then the relative values of quadrant[i] and
973                     quadrant[j] are meaningless.
974 
975                else {
976                   if quadrant[i] < quadrant[j]
977                      then the string starting at i lexicographically
978                      precedes the string starting at j
979 
980                   else if quadrant[i] > quadrant[j]
981                      then the string starting at j lexicographically
982                      precedes the string starting at i
983 
984                   else
985                      the relative ordering of the strings starting
986                      at i and j has not yet been determined.
987                }
988       --*/
989       bigDone[ss] = True;
990 
991       if (i < 255) {
992          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
993          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
994          Int32 shifts   = 0;
995 
996          while ((bbSize >> shifts) > 65534) shifts++;
997 
998          for (j = bbSize-1; j >= 0; j--) {
999             Int32 a2update     = ptr[bbStart + j];
1000             UInt16 qVal        = (UInt16)(j >> shifts);
1001             quadrant[a2update] = qVal;
1002             if (a2update < BZ_N_OVERSHOOT)
1003                quadrant[a2update + nblock] = qVal;
1004          }
1005          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1006       }
1007 
1008    }
1009 
1010    if (verb >= 4)
1011       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1012                  nblock, numQSorted, nblock - numQSorted );
1013 }
1014 
1015 #undef BIGFREQ
1016 #undef SETMASK
1017 #undef CLEARMASK
1018 
1019 
1020 /*---------------------------------------------*/
1021 /* Pre:
1022       nblock > 0
1023       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1024       ((UChar*)arr2)  [0 .. nblock-1] holds block
1025       arr1 exists for [0 .. nblock-1]
1026 
1027    Post:
1028       ((UChar*)arr2) [0 .. nblock-1] holds block
1029       All other areas of block destroyed
1030       ftab [ 0 .. 65536 ] destroyed
1031       arr1 [0 .. nblock-1] holds sorted order
1032 */
BZ2_blockSort(EState * s)1033 void BZ2_blockSort ( EState* s )
1034 {
1035    UInt32* ptr    = s->ptr;
1036    UChar*  block  = s->block;
1037    UInt32* ftab   = s->ftab;
1038    Int32   nblock = s->nblock;
1039    Int32   verb   = s->verbosity;
1040    Int32   wfact  = s->workFactor;
1041    UInt16* quadrant;
1042    Int32   budget;
1043    Int32   budgetInit;
1044    Int32   i;
1045 
1046    if (nblock < 10000) {
1047       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1048    } else {
1049       /* Calculate the location for quadrant, remembering to get
1050          the alignment right.  Assumes that &(block[0]) is at least
1051          2-byte aligned -- this should be ok since block is really
1052          the first section of arr2.
1053       */
1054       i = nblock+BZ_N_OVERSHOOT;
1055       if (i & 1) i++;
1056       quadrant = (UInt16*)(&(block[i]));
1057 
1058       /* (wfact-1) / 3 puts the default-factor-30
1059          transition point at very roughly the same place as
1060          with v0.1 and v0.9.0.
1061          Not that it particularly matters any more, since the
1062          resulting compressed stream is now the same regardless
1063          of whether or not we use the main sort or fallback sort.
1064       */
1065       if (wfact < 1  ) wfact = 1;
1066       if (wfact > 100) wfact = 100;
1067       budgetInit = nblock * ((wfact-1) / 3);
1068       budget = budgetInit;
1069 
1070       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1071       if (verb >= 3)
1072          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1073                     budgetInit - budget,
1074                     nblock,
1075                     (float)(budgetInit - budget) /
1076                     (float)(nblock==0 ? 1 : nblock) );
1077       if (budget < 0) {
1078          if (verb >= 2)
1079             VPrintf0 ( "    too repetitive; using fallback"
1080                        " sorting algorithm\n" );
1081          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1082       }
1083    }
1084 
1085    s->origPtr = -1;
1086    for (i = 0; i < s->nblock; i++)
1087       if (ptr[i] == 0)
1088          { s->origPtr = i; break; };
1089 
1090    AssertH( s->origPtr != -1, 1003 );
1091 }
1092 
1093 
1094 /*-------------------------------------------------------------*/
1095 /*--- end                                       blocksort.c ---*/
1096 /*-------------------------------------------------------------*/
1097 
1098 ABC_NAMESPACE_IMPL_END
1099