1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /* ------------------------------------------------------------------
8    This file is part of bzip2/libbzip2, a program and library for
9    lossless, block-sorting data compression.
10 
11    bzip2/libbzip2 version 1.0.5 of 10 December 2007
12    Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org>
13 
14    Please read the WARNING, DISCLAIMER and PATENTS sections in the
15    README file.
16 
17    This program is released under the terms of the license contained
18    in the file LICENSE.
19    ------------------------------------------------------------------ */
20 
21 #include "bzlib_private.h"
22 
23 /*---------------------------------------------*/
24 /*--- Fallback O(N log(N)^2) sorting        ---*/
25 /*--- algorithm, for repetitive blocks      ---*/
26 /*---------------------------------------------*/
27 
28 /*---------------------------------------------*/
29 static
30 __inline__
fallbackSimpleSort(UInt32 * fmap,UInt32 * eclass,Int32 lo,Int32 hi)31 void fallbackSimpleSort ( UInt32* fmap,
32                           UInt32* eclass,
33                           Int32   lo,
34                           Int32   hi )
35 {
36    Int32 i, j, tmp;
37    UInt32 ec_tmp;
38 
39    if (lo == hi) return;
40 
41    if (hi - lo > 3) {
42       for ( i = hi-4; i >= lo; i-- ) {
43          tmp = fmap[i];
44          ec_tmp = eclass[tmp];
45          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
46             fmap[j-4] = fmap[j];
47          fmap[j-4] = tmp;
48       }
49    }
50 
51    for ( i = hi-1; i >= lo; i-- ) {
52       tmp = fmap[i];
53       ec_tmp = eclass[tmp];
54       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
55          fmap[j-1] = fmap[j];
56       fmap[j-1] = tmp;
57    }
58 }
59 
60 /*---------------------------------------------*/
61 #define fswap(zz1, zz2) \
62    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
63 
64 #define fvswap(zzp1, zzp2, zzn)       \
65 {                                     \
66    Int32 yyp1 = (zzp1);               \
67    Int32 yyp2 = (zzp2);               \
68    Int32 yyn  = (zzn);                \
69    while (yyn > 0) {                  \
70       fswap(fmap[yyp1], fmap[yyp2]);  \
71       yyp1++; yyp2++; yyn--;          \
72    }                                  \
73 }
74 
75 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
76 
77 #define fpush(lz,hz) { stackLo[sp] = lz; \
78                        stackHi[sp] = hz; \
79                        sp++; }
80 
81 #define fpop(lz,hz) { sp--;              \
82                       lz = stackLo[sp];  \
83                       hz = stackHi[sp]; }
84 
85 #define FALLBACK_QSORT_SMALL_THRESH 10
86 #define FALLBACK_QSORT_STACK_SIZE   100
87 
88 
89 static
fallbackQSort3(UInt32 * fmap,UInt32 * eclass,Int32 loSt,Int32 hiSt)90 void fallbackQSort3 ( UInt32* fmap,
91                       UInt32* eclass,
92                       Int32   loSt,
93                       Int32   hiSt )
94 {
95    Int32 unLo, unHi, ltLo, gtHi, n, m;
96    Int32 sp, lo, hi;
97    UInt32 med, r, r3;
98    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
99    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
100 
101    r = 0;
102 
103    sp = 0;
104    fpush ( loSt, hiSt );
105 
106    while (sp > 0) {
107 
108       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
109 
110       fpop ( lo, hi );
111       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
112          fallbackSimpleSort ( fmap, eclass, lo, hi );
113          continue;
114       }
115 
116       /* Random partitioning.  Median of 3 sometimes fails to
117          avoid bad cases.  Median of 9 seems to help but
118          looks rather expensive.  This too seems to work but
119          is cheaper.  Guidance for the magic constants
120          7621 and 32768 is taken from Sedgewick's algorithms
121          book, chapter 35.
122       */
123       r = ((r * 7621) + 1) % 32768;
124       r3 = r % 3;
125       if (r3 == 0) med = eclass[fmap[lo]]; else
126       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
127                    med = eclass[fmap[hi]];
128 
129       unLo = ltLo = lo;
130       unHi = gtHi = hi;
131 
132       while (1) {
133          while (1) {
134             if (unLo > unHi) break;
135             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
136             if (n == 0) {
137                fswap(fmap[unLo], fmap[ltLo]);
138                ltLo++; unLo++;
139                continue;
140             };
141             if (n > 0) break;
142             unLo++;
143          }
144          while (1) {
145             if (unLo > unHi) break;
146             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
147             if (n == 0) {
148                fswap(fmap[unHi], fmap[gtHi]);
149                gtHi--; unHi--;
150                continue;
151             };
152             if (n < 0) break;
153             unHi--;
154          }
155          if (unLo > unHi) break;
156          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
157       }
158 
159       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
160 
161       if (gtHi < ltLo) continue;
162 
163       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
164       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
165 
166       n = lo + unLo - ltLo - 1;
167       m = hi - (gtHi - unHi) + 1;
168 
169       if (n - lo > hi - m) {
170          fpush ( lo, n );
171          fpush ( m, hi );
172       } else {
173          fpush ( m, hi );
174          fpush ( lo, n );
175       }
176    }
177 }
178 
179 #undef fmin
180 #undef fpush
181 #undef fpop
182 #undef fswap
183 #undef fvswap
184 #undef FALLBACK_QSORT_SMALL_THRESH
185 #undef FALLBACK_QSORT_STACK_SIZE
186 
187 /*---------------------------------------------*/
188 /* Pre:
189       nblock > 0
190       eclass exists for [0 .. nblock-1]
191       ((UChar*)eclass) [0 .. nblock-1] holds block
192       ptr exists for [0 .. nblock-1]
193 
194    Post:
195       ((UChar*)eclass) [0 .. nblock-1] holds block
196       All other areas of eclass destroyed
197       fmap [0 .. nblock-1] holds sorted order
198       bhtab [ 0 .. 2+(nblock/32) ] destroyed
199 */
200 
201 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
202 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
203 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
204 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
205 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
206 
207 static
fallbackSort(UInt32 * fmap,UInt32 * eclass,UInt32 * bhtab,Int32 nblock,Int32 verb)208 void fallbackSort ( UInt32* fmap,
209                     UInt32* eclass,
210                     UInt32* bhtab,
211                     Int32   nblock,
212                     Int32   verb )
213 {
214    Int32 ftab[257];
215    Int32 ftabCopy[256];
216    Int32 H, i, j, k, l, r, cc, cc1;
217    Int32 nNotDone;
218    Int32 nBhtab;
219    UChar* eclass8 = (UChar*)eclass;
220 
221    /*--
222       Initial 1-char radix sort to generate
223       initial fmap and initial BH bits.
224    --*/
225    if (verb >= 4)
226       VPrintf0 ( "        bucket sorting ...\n" );
227    for (i = 0; i < 257;    i++) ftab[i] = 0;
228    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
229    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
230    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
231 
232    for (i = 0; i < nblock; i++) {
233       j = eclass8[i];
234       k = ftab[j] - 1;
235       ftab[j] = k;
236       fmap[k] = i;
237    }
238 
239    nBhtab = 2 + (nblock / 32);
240    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
241    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
242 
243    /*--
244       Inductively refine the buckets.  Kind-of an
245       "exponential radix sort" (!), inspired by the
246       Manber-Myers suffix array construction algorithm.
247    --*/
248 
249    /*-- set sentinel bits for block-end detection --*/
250    for (i = 0; i < 32; i++) {
251       SET_BH(nblock + 2*i);
252       CLEAR_BH(nblock + 2*i + 1);
253    }
254 
255    /*-- the log(N) loop --*/
256    H = 1;
257    while (1) {
258 
259       if (verb >= 4)
260          VPrintf1 ( "        depth %6d has ", H );
261 
262       j = 0;
263       for (i = 0; i < nblock; i++) {
264          if (ISSET_BH(i)) j = i;
265          k = fmap[i] - H; if (k < 0) k += nblock;
266          eclass[k] = j;
267       }
268 
269       nNotDone = 0;
270       r = -1;
271       while (1) {
272 
273 	 /*-- find the next non-singleton bucket --*/
274          k = r + 1;
275          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
276          if (ISSET_BH(k)) {
277             while (WORD_BH(k) == 0xffffffff) k += 32;
278             while (ISSET_BH(k)) k++;
279          }
280          l = k - 1;
281          if (l >= nblock) break;
282          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
283          if (!ISSET_BH(k)) {
284             while (WORD_BH(k) == 0x00000000) k += 32;
285             while (!ISSET_BH(k)) k++;
286          }
287          r = k - 1;
288          if (r >= nblock) break;
289 
290          /*-- now [l, r] bracket current bucket --*/
291          if (r > l) {
292             nNotDone += (r - l + 1);
293             fallbackQSort3 ( fmap, eclass, l, r );
294 
295             /*-- scan bucket and generate header bits-- */
296             cc = -1;
297             for (i = l; i <= r; i++) {
298                cc1 = eclass[fmap[i]];
299                if (cc != cc1) { SET_BH(i); cc = cc1; };
300             }
301          }
302       }
303 
304       if (verb >= 4)
305          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
306 
307       H *= 2;
308       if (H > nblock || nNotDone == 0) break;
309    }
310 
311    /*--
312       Reconstruct the original block in
313       eclass8 [0 .. nblock-1], since the
314       previous phase destroyed it.
315    --*/
316    if (verb >= 4)
317       VPrintf0 ( "        reconstructing block ...\n" );
318    j = 0;
319    for (i = 0; i < nblock; i++) {
320       while (ftabCopy[j] == 0) j++;
321       ftabCopy[j]--;
322       eclass8[fmap[i]] = (UChar)j;
323    }
324    AssertH ( j < 256, 1005 );
325 }
326 
327 #undef       SET_BH
328 #undef     CLEAR_BH
329 #undef     ISSET_BH
330 #undef      WORD_BH
331 #undef UNALIGNED_BH
332 
333 /*---------------------------------------------*/
334 /*--- The main, O(N^2 log(N)) sorting       ---*/
335 /*--- algorithm.  Faster for "normal"       ---*/
336 /*--- non-repetitive blocks.                ---*/
337 /*---------------------------------------------*/
338 
339 /*---------------------------------------------*/
340 static
341 __inline__
mainGtU(UInt32 i1,UInt32 i2,UChar * block,UInt16 * quadrant,UInt32 nblock,Int32 * budget)342 Bool mainGtU ( UInt32  i1,
343                UInt32  i2,
344                UChar*  block,
345                UInt16* quadrant,
346                UInt32  nblock,
347                Int32*  budget )
348 {
349    Int32  k;
350    UChar  c1, c2;
351    UInt16 s1, s2;
352 
353    AssertD ( i1 != i2, "mainGtU" );
354    /* 1 */
355    c1 = block[i1]; c2 = block[i2];
356    if (c1 != c2) return (c1 > c2);
357    i1++; i2++;
358    /* 2 */
359    c1 = block[i1]; c2 = block[i2];
360    if (c1 != c2) return (c1 > c2);
361    i1++; i2++;
362    /* 3 */
363    c1 = block[i1]; c2 = block[i2];
364    if (c1 != c2) return (c1 > c2);
365    i1++; i2++;
366    /* 4 */
367    c1 = block[i1]; c2 = block[i2];
368    if (c1 != c2) return (c1 > c2);
369    i1++; i2++;
370    /* 5 */
371    c1 = block[i1]; c2 = block[i2];
372    if (c1 != c2) return (c1 > c2);
373    i1++; i2++;
374    /* 6 */
375    c1 = block[i1]; c2 = block[i2];
376    if (c1 != c2) return (c1 > c2);
377    i1++; i2++;
378    /* 7 */
379    c1 = block[i1]; c2 = block[i2];
380    if (c1 != c2) return (c1 > c2);
381    i1++; i2++;
382    /* 8 */
383    c1 = block[i1]; c2 = block[i2];
384    if (c1 != c2) return (c1 > c2);
385    i1++; i2++;
386    /* 9 */
387    c1 = block[i1]; c2 = block[i2];
388    if (c1 != c2) return (c1 > c2);
389    i1++; i2++;
390    /* 10 */
391    c1 = block[i1]; c2 = block[i2];
392    if (c1 != c2) return (c1 > c2);
393    i1++; i2++;
394    /* 11 */
395    c1 = block[i1]; c2 = block[i2];
396    if (c1 != c2) return (c1 > c2);
397    i1++; i2++;
398    /* 12 */
399    c1 = block[i1]; c2 = block[i2];
400    if (c1 != c2) return (c1 > c2);
401    i1++; i2++;
402 
403    k = nblock + 8;
404 
405    do {
406       /* 1 */
407       c1 = block[i1]; c2 = block[i2];
408       if (c1 != c2) return (c1 > c2);
409       s1 = quadrant[i1]; s2 = quadrant[i2];
410       if (s1 != s2) return (s1 > s2);
411       i1++; i2++;
412       /* 2 */
413       c1 = block[i1]; c2 = block[i2];
414       if (c1 != c2) return (c1 > c2);
415       s1 = quadrant[i1]; s2 = quadrant[i2];
416       if (s1 != s2) return (s1 > s2);
417       i1++; i2++;
418       /* 3 */
419       c1 = block[i1]; c2 = block[i2];
420       if (c1 != c2) return (c1 > c2);
421       s1 = quadrant[i1]; s2 = quadrant[i2];
422       if (s1 != s2) return (s1 > s2);
423       i1++; i2++;
424       /* 4 */
425       c1 = block[i1]; c2 = block[i2];
426       if (c1 != c2) return (c1 > c2);
427       s1 = quadrant[i1]; s2 = quadrant[i2];
428       if (s1 != s2) return (s1 > s2);
429       i1++; i2++;
430       /* 5 */
431       c1 = block[i1]; c2 = block[i2];
432       if (c1 != c2) return (c1 > c2);
433       s1 = quadrant[i1]; s2 = quadrant[i2];
434       if (s1 != s2) return (s1 > s2);
435       i1++; i2++;
436       /* 6 */
437       c1 = block[i1]; c2 = block[i2];
438       if (c1 != c2) return (c1 > c2);
439       s1 = quadrant[i1]; s2 = quadrant[i2];
440       if (s1 != s2) return (s1 > s2);
441       i1++; i2++;
442       /* 7 */
443       c1 = block[i1]; c2 = block[i2];
444       if (c1 != c2) return (c1 > c2);
445       s1 = quadrant[i1]; s2 = quadrant[i2];
446       if (s1 != s2) return (s1 > s2);
447       i1++; i2++;
448       /* 8 */
449       c1 = block[i1]; c2 = block[i2];
450       if (c1 != c2) return (c1 > c2);
451       s1 = quadrant[i1]; s2 = quadrant[i2];
452       if (s1 != s2) return (s1 > s2);
453       i1++; i2++;
454 
455       if (i1 >= nblock) i1 -= nblock;
456       if (i2 >= nblock) i2 -= nblock;
457 
458       k -= 8;
459       (*budget)--;
460    }
461       while (k >= 0);
462 
463    return False;
464 }
465 
466 /*---------------------------------------------*/
467 /*--
468    Knuth's increments seem to work better
469    than Incerpi-Sedgewick here.  Possibly
470    because the number of elems to sort is
471    usually small, typically <= 20.
472 --*/
473 static
474 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
475                    9841, 29524, 88573, 265720,
476                    797161, 2391484 };
477 
478 static
mainSimpleSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 lo,Int32 hi,Int32 d,Int32 * budget)479 void mainSimpleSort ( UInt32* ptr,
480                       UChar*  block,
481                       UInt16* quadrant,
482                       Int32   nblock,
483                       Int32   lo,
484                       Int32   hi,
485                       Int32   d,
486                       Int32*  budget )
487 {
488    Int32 i, j, h, bigN, hp;
489    UInt32 v;
490 
491    bigN = hi - lo + 1;
492    if (bigN < 2) return;
493 
494    hp = 0;
495    while (incs[hp] < bigN) hp++;
496    hp--;
497 
498    for (; hp >= 0; hp--) {
499       h = incs[hp];
500 
501       i = lo + h;
502       while (True) {
503 
504          /*-- copy 1 --*/
505          if (i > hi) break;
506          v = ptr[i];
507          j = i;
508          while ( mainGtU (
509                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
510                  ) ) {
511             ptr[j] = ptr[j-h];
512             j = j - h;
513             if (j <= (lo + h - 1)) break;
514          }
515          ptr[j] = v;
516          i++;
517 
518          /*-- copy 2 --*/
519          if (i > hi) break;
520          v = ptr[i];
521          j = i;
522          while ( mainGtU (
523                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
524                  ) ) {
525             ptr[j] = ptr[j-h];
526             j = j - h;
527             if (j <= (lo + h - 1)) break;
528          }
529          ptr[j] = v;
530          i++;
531 
532          /*-- copy 3 --*/
533          if (i > hi) break;
534          v = ptr[i];
535          j = i;
536          while ( mainGtU (
537                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
538                  ) ) {
539             ptr[j] = ptr[j-h];
540             j = j - h;
541             if (j <= (lo + h - 1)) break;
542          }
543          ptr[j] = v;
544          i++;
545 
546          if (*budget < 0) return;
547       }
548    }
549 }
550 
551 /*---------------------------------------------*/
552 /*--
553    The following is an implementation of
554    an elegant 3-way quicksort for strings,
555    described in a paper "Fast Algorithms for
556    Sorting and Searching Strings", by Robert
557    Sedgewick and Jon L. Bentley.
558 --*/
559 
560 #define mswap(zz1, zz2) \
561    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
562 
563 #define mvswap(zzp1, zzp2, zzn)       \
564 {                                     \
565    Int32 yyp1 = (zzp1);               \
566    Int32 yyp2 = (zzp2);               \
567    Int32 yyn  = (zzn);                \
568    while (yyn > 0) {                  \
569       mswap(ptr[yyp1], ptr[yyp2]);    \
570       yyp1++; yyp2++; yyn--;          \
571    }                                  \
572 }
573 
574 static
575 __inline__
mmed3(UChar a,UChar b,UChar c)576 UChar mmed3 ( UChar a, UChar b, UChar c )
577 {
578    UChar t;
579    if (a > b) { t = a; a = b; b = t; };
580    if (b > c) {
581       b = c;
582       if (a > b) b = a;
583    }
584    return b;
585 }
586 
587 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
588 
589 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
590                           stackHi[sp] = hz; \
591                           stackD [sp] = dz; \
592                           sp++; }
593 
594 #define mpop(lz,hz,dz) { sp--;             \
595                          lz = stackLo[sp]; \
596                          hz = stackHi[sp]; \
597                          dz = stackD [sp]; }
598 
599 
600 #define mnextsize(az) (nextHi[az]-nextLo[az])
601 
602 #define mnextswap(az,bz)                                        \
603    { Int32 tz;                                                  \
604      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
605      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
606      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
607 
608 
609 #define MAIN_QSORT_SMALL_THRESH 20
610 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
611 #define MAIN_QSORT_STACK_SIZE 100
612 
613 static
mainQSort3(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 loSt,Int32 hiSt,Int32 dSt,Int32 * budget)614 void mainQSort3 ( UInt32* ptr,
615                   UChar*  block,
616                   UInt16* quadrant,
617                   Int32   nblock,
618                   Int32   loSt,
619                   Int32   hiSt,
620                   Int32   dSt,
621                   Int32*  budget )
622 {
623    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
624    Int32 sp, lo, hi, d;
625 
626    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
627    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
628    Int32 stackD [MAIN_QSORT_STACK_SIZE];
629 
630    Int32 nextLo[3];
631    Int32 nextHi[3];
632    Int32 nextD [3];
633 
634    sp = 0;
635    mpush ( loSt, hiSt, dSt );
636 
637    while (sp > 0) {
638 
639       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
640 
641       mpop ( lo, hi, d );
642       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
643           d > MAIN_QSORT_DEPTH_THRESH) {
644          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
645          if (*budget < 0) return;
646          continue;
647       }
648 
649       med = (Int32)
650             mmed3 ( block[ptr[ lo         ]+d],
651                     block[ptr[ hi         ]+d],
652                     block[ptr[ (lo+hi)>>1 ]+d] );
653 
654       unLo = ltLo = lo;
655       unHi = gtHi = hi;
656 
657       while (True) {
658          while (True) {
659             if (unLo > unHi) break;
660             n = ((Int32)block[ptr[unLo]+d]) - med;
661             if (n == 0) {
662                mswap(ptr[unLo], ptr[ltLo]);
663                ltLo++; unLo++; continue;
664             };
665             if (n >  0) break;
666             unLo++;
667          }
668          while (True) {
669             if (unLo > unHi) break;
670             n = ((Int32)block[ptr[unHi]+d]) - med;
671             if (n == 0) {
672                mswap(ptr[unHi], ptr[gtHi]);
673                gtHi--; unHi--; continue;
674             };
675             if (n <  0) break;
676             unHi--;
677          }
678          if (unLo > unHi) break;
679          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
680       }
681 
682       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
683 
684       if (gtHi < ltLo) {
685          mpush(lo, hi, d+1 );
686          continue;
687       }
688 
689       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
690       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
691 
692       n = lo + unLo - ltLo - 1;
693       m = hi - (gtHi - unHi) + 1;
694 
695       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
696       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
697       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
698 
699       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
700       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
701       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
702 
703       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
704       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
705 
706       mpush (nextLo[0], nextHi[0], nextD[0]);
707       mpush (nextLo[1], nextHi[1], nextD[1]);
708       mpush (nextLo[2], nextHi[2], nextD[2]);
709    }
710 }
711 
712 #undef mswap
713 #undef mvswap
714 #undef mpush
715 #undef mpop
716 #undef mmin
717 #undef mnextsize
718 #undef mnextswap
719 #undef MAIN_QSORT_SMALL_THRESH
720 #undef MAIN_QSORT_DEPTH_THRESH
721 #undef MAIN_QSORT_STACK_SIZE
722 
723 /*---------------------------------------------*/
724 /* Pre:
725       nblock > N_OVERSHOOT
726       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
727       ((UChar*)block32) [0 .. nblock-1] holds block
728       ptr exists for [0 .. nblock-1]
729 
730    Post:
731       ((UChar*)block32) [0 .. nblock-1] holds block
732       All other areas of block32 destroyed
733       ftab [0 .. 65536 ] destroyed
734       ptr [0 .. nblock-1] holds sorted order
735       if (*budget < 0), sorting was abandoned
736 */
737 
738 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
739 #define SETMASK (1 << 21)
740 #define CLEARMASK (~(SETMASK))
741 
742 static
mainSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,UInt32 * ftab,Int32 nblock,Int32 verb,Int32 * budget)743 void mainSort ( UInt32* ptr,
744                 UChar*  block,
745                 UInt16* quadrant,
746                 UInt32* ftab,
747                 Int32   nblock,
748                 Int32   verb,
749                 Int32*  budget )
750 {
751    Int32  i, j, k, ss, sb;
752    Int32  runningOrder[256];
753    Bool   bigDone[256];
754    Int32  copyStart[256];
755    Int32  copyEnd  [256];
756    UChar  c1;
757    Int32  numQSorted;
758    UInt16 s;
759    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
760 
761    /*-- set up the 2-byte frequency table --*/
762    for (i = 65536; i >= 0; i--) ftab[i] = 0;
763 
764    j = block[0] << 8;
765    i = nblock-1;
766    for (; i >= 3; i -= 4) {
767       quadrant[i] = 0;
768       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
769       ftab[j]++;
770       quadrant[i-1] = 0;
771       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
772       ftab[j]++;
773       quadrant[i-2] = 0;
774       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
775       ftab[j]++;
776       quadrant[i-3] = 0;
777       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
778       ftab[j]++;
779    }
780    for (; i >= 0; i--) {
781       quadrant[i] = 0;
782       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
783       ftab[j]++;
784    }
785 
786    /*-- (emphasises close relationship of block & quadrant) --*/
787    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
788       block   [nblock+i] = block[i];
789       quadrant[nblock+i] = 0;
790    }
791 
792    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
793 
794    /*-- Complete the initial radix sort --*/
795    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
796 
797    s = block[0] << 8;
798    i = nblock-1;
799    for (; i >= 3; i -= 4) {
800       s = (s >> 8) | (block[i] << 8);
801       j = ftab[s] -1;
802       ftab[s] = j;
803       ptr[j] = i;
804       s = (s >> 8) | (block[i-1] << 8);
805       j = ftab[s] -1;
806       ftab[s] = j;
807       ptr[j] = i-1;
808       s = (s >> 8) | (block[i-2] << 8);
809       j = ftab[s] -1;
810       ftab[s] = j;
811       ptr[j] = i-2;
812       s = (s >> 8) | (block[i-3] << 8);
813       j = ftab[s] -1;
814       ftab[s] = j;
815       ptr[j] = i-3;
816    }
817    for (; i >= 0; i--) {
818       s = (s >> 8) | (block[i] << 8);
819       j = ftab[s] -1;
820       ftab[s] = j;
821       ptr[j] = i;
822    }
823 
824    /*--
825       Now ftab contains the first loc of every small bucket.
826       Calculate the running order, from smallest to largest
827       big bucket.
828    --*/
829    for (i = 0; i <= 255; i++) {
830       bigDone     [i] = False;
831       runningOrder[i] = i;
832    }
833 
834    {
835       Int32 vv;
836       Int32 h = 1;
837       do h = 3 * h + 1; while (h <= 256);
838       do {
839          h = h / 3;
840          for (i = h; i <= 255; i++) {
841             vv = runningOrder[i];
842             j = i;
843             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
844                runningOrder[j] = runningOrder[j-h];
845                j = j - h;
846                if (j <= (h - 1)) goto zero;
847             }
848             zero:
849             runningOrder[j] = vv;
850          }
851       } while (h != 1);
852    }
853 
854    /*--
855       The main sorting loop.
856    --*/
857 
858    numQSorted = 0;
859 
860    for (i = 0; i <= 255; i++) {
861 
862       /*--
863          Process big buckets, starting with the least full.
864          Basically this is a 3-step process in which we call
865          mainQSort3 to sort the small buckets [ss, j], but
866          also make a big effort to avoid the calls if we can.
867       --*/
868       ss = runningOrder[i];
869 
870       /*--
871          Step 1:
872          Complete the big bucket [ss] by quicksorting
873          any unsorted small buckets [ss, j], for j != ss.
874          Hopefully previous pointer-scanning phases have already
875          completed many of the small buckets [ss, j], so
876          we don't have to sort them at all.
877       --*/
878       for (j = 0; j <= 255; j++) {
879          if (j != ss) {
880             sb = (ss << 8) + j;
881             if ( ! (ftab[sb] & SETMASK) ) {
882                Int32 lo = ftab[sb]   & CLEARMASK;
883                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
884                if (hi > lo) {
885                   if (verb >= 4)
886                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
887                                 "done %d   this %d\n",
888                                 ss, j, numQSorted, hi - lo + 1 );
889                   mainQSort3 (
890                      ptr, block, quadrant, nblock,
891                      lo, hi, BZ_N_RADIX, budget
892                   );
893                   numQSorted += (hi - lo + 1);
894                   if (*budget < 0) return;
895                }
896             }
897             ftab[sb] |= SETMASK;
898          }
899       }
900 
901       AssertH ( !bigDone[ss], 1006 );
902 
903       /*--
904          Step 2:
905          Now scan this big bucket [ss] so as to synthesise the
906          sorted order for small buckets [t, ss] for all t,
907          including, magically, the bucket [ss,ss] too.
908          This will avoid doing Real Work in subsequent Step 1's.
909       --*/
910       {
911          for (j = 0; j <= 255; j++) {
912             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
913             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
914          }
915          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
916             k = ptr[j]-1; if (k < 0) k += nblock;
917             c1 = block[k];
918             if (!bigDone[c1])
919                ptr[ copyStart[c1]++ ] = k;
920          }
921          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
922             k = ptr[j]-1; if (k < 0) k += nblock;
923             c1 = block[k];
924             if (!bigDone[c1])
925                ptr[ copyEnd[c1]-- ] = k;
926          }
927       }
928 
929       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
930                 ||
931                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
932                    Necessity for this case is demonstrated by compressing
933                    a sequence of approximately 48.5 million of character
934                    251; 1.0.0/1.0.1 will then die here. */
935                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
936                 1007 )
937 
938       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
939 
940       /*--
941          Step 3:
942          The [ss] big bucket is now done.  Record this fact,
943          and update the quadrant descriptors.  Remember to
944          update quadrants in the overshoot area too, if
945          necessary.  The "if (i < 255)" test merely skips
946          this updating for the last bucket processed, since
947          updating for the last bucket is pointless.
948 
949          The quadrant array provides a way to incrementally
950          cache sort orderings, as they appear, so as to
951          make subsequent comparisons in fullGtU() complete
952          faster.  For repetitive blocks this makes a big
953          difference (but not big enough to be able to avoid
954          the fallback sorting mechanism, exponential radix sort).
955 
956          The precise meaning is: at all times:
957 
958             for 0 <= i < nblock and 0 <= j <= nblock
959 
960             if block[i] != block[j],
961 
962                then the relative values of quadrant[i] and
963                     quadrant[j] are meaningless.
964 
965                else {
966                   if quadrant[i] < quadrant[j]
967                      then the string starting at i lexicographically
968                      precedes the string starting at j
969 
970                   else if quadrant[i] > quadrant[j]
971                      then the string starting at j lexicographically
972                      precedes the string starting at i
973 
974                   else
975                      the relative ordering of the strings starting
976                      at i and j has not yet been determined.
977                }
978       --*/
979       bigDone[ss] = True;
980 
981       if (i < 255) {
982          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
983          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
984          Int32 shifts   = 0;
985 
986          while ((bbSize >> shifts) > 65534) shifts++;
987 
988          for (j = bbSize-1; j >= 0; j--) {
989             Int32 a2update     = ptr[bbStart + j];
990             UInt16 qVal        = (UInt16)(j >> shifts);
991             quadrant[a2update] = qVal;
992             if (a2update < BZ_N_OVERSHOOT)
993                quadrant[a2update + nblock] = qVal;
994          }
995          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
996       }
997 
998    }
999 
1000    if (verb >= 4)
1001       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1002                  nblock, numQSorted, nblock - numQSorted );
1003 }
1004 
1005 #undef BIGFREQ
1006 #undef SETMASK
1007 #undef CLEARMASK
1008 
1009 /*---------------------------------------------*/
1010 /* Pre:
1011       nblock > 0
1012       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1013       ((UChar*)arr2)  [0 .. nblock-1] holds block
1014       arr1 exists for [0 .. nblock-1]
1015 
1016    Post:
1017       ((UChar*)arr2) [0 .. nblock-1] holds block
1018       All other areas of block destroyed
1019       ftab [ 0 .. 65536 ] destroyed
1020       arr1 [0 .. nblock-1] holds sorted order
1021 */
BZ2_blockSort(EState * s)1022 void BZ2_blockSort ( EState* s )
1023 {
1024    UInt32* ptr    = s->ptr;
1025    UChar*  block  = s->block;
1026    UInt32* ftab   = s->ftab;
1027    Int32   nblock = s->nblock;
1028    Int32   verb   = s->verbosity;
1029    Int32   wfact  = s->workFactor;
1030    UInt16* quadrant;
1031    Int32   budget;
1032    Int32   budgetInit;
1033    Int32   i;
1034 
1035    if (nblock < 10000) {
1036       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1037    } else {
1038       /* Calculate the location for quadrant, remembering to get
1039          the alignment right.  Assumes that &(block[0]) is at least
1040          2-byte aligned -- this should be ok since block is really
1041          the first section of arr2.
1042       */
1043       i = nblock+BZ_N_OVERSHOOT;
1044       if (i & 1) i++;
1045       quadrant = (UInt16*)(&(block[i]));
1046 
1047       /* (wfact-1) / 3 puts the default-factor-30
1048          transition point at very roughly the same place as
1049          with v0.1 and v0.9.0.
1050          Not that it particularly matters any more, since the
1051          resulting compressed stream is now the same regardless
1052          of whether or not we use the main sort or fallback sort.
1053       */
1054       if (wfact < 1  ) wfact = 1;
1055       if (wfact > 100) wfact = 100;
1056       budgetInit = nblock * ((wfact-1) / 3);
1057       budget = budgetInit;
1058 
1059       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1060       if (verb >= 3)
1061          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1062                     budgetInit - budget,
1063                     nblock,
1064                     (float)(budgetInit - budget) /
1065                     (float)(nblock==0 ? 1 : nblock) );
1066       if (budget < 0) {
1067          if (verb >= 2)
1068             VPrintf0 ( "    too repetitive; using fallback"
1069                        " sorting algorithm\n" );
1070          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1071       }
1072    }
1073 
1074    s->origPtr = -1;
1075    for (i = 0; i < s->nblock; i++)
1076       if (ptr[i] == 0)
1077          { s->origPtr = i; break; };
1078 
1079    AssertH( s->origPtr != -1, 1003 );
1080 }
1081 
1082 /*-------------------------------------------------------------*/
1083 /*--- end                                       blocksort.c ---*/
1084 /*-------------------------------------------------------------*/
1085