1 // bwt.cpp - BWT transform (does not compress)
2 // (C) 2006-2009, Matt Mahoney,
3 // This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
4 
5 // Derived from BBB, without the compression part. Commands are the same.
6 
7 
8 #include <cstdio>
9 #include <cstdlib>
10 #include <cstring>
11 #include <ctime>
12 #include <algorithm>
13 #define NDEBUG  // remove for debugging
14 #include <cassert>
15 using namespace std;
16 
17 // 8, 16, 32 bit unsigned types
18 typedef unsigned char U8;
19 typedef unsigned short U16;
20 typedef unsigned int U32;
21 
22 //////////////////////////// Array ////////////////////////////
23 
24 // Array<T, ALIGN> a(n); creates n elements of T initialized to 0 bits.
25 // Constructors for T are not called.
26 // Indexing is bounds checked if assertions are on.
27 // a.size() returns n.
28 // a.resize(n) changes size to n, padding with 0 bits or truncating.
29 // Copy and assignment are not supported.
30 // Memory is aligned on a ALIGN byte boundary (power of 2), default is none.
31 
32 template <class T, int ALIGN=0> class Array {
33 private:
34   int n;     // user size
35   int reserved;  // actual size
36   char *ptr; // allocated memory, zeroed
37   T* data;   // start of n elements of aligned data
38   void create(int i);  // create with size i
39 public:
Array(int i=0)40   explicit Array(int i=0) {create(i);}
41   ~Array();
operator [](int i)42   T& operator[](int i) {
43 #ifndef NDEBUG
44     if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), exit(1);
45 #endif
46     return data[i];
47   }
operator [](int i) const48   const T& operator[](int i) const {
49 #ifndef NDEBUG
50     if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), exit(1);
51 #endif
52     return data[i];
53   }
size() const54   int size() const {return n;}
55   void resize(int i);  // change size to i
56 private:
57   Array(const Array&);  // no copy or assignment
58   Array& operator=(const Array&);
59 };
60 
resize(int i)61 template<class T, int ALIGN> void Array<T, ALIGN>::resize(int i) {
62   if (i<=reserved) {
63     n=i;
64     return;
65   }
66   char *saveptr=ptr;
67   T *savedata=data;
68   int saven=n;
69   create(i);
70   if (savedata && saveptr) {
71     memcpy(data, savedata, sizeof(T)*min(i, saven));
72     free(saveptr);
73   }
74 }
75 
create(int i)76 template<class T, int ALIGN> void Array<T, ALIGN>::create(int i) {
77   n=reserved=i;
78   if (i<=0) {
79     data=0;
80     ptr=0;
81     return;
82   }
83   const int sz=ALIGN+n*sizeof(T);
84   ptr = (char*)calloc(sz, 1);
85   if (!ptr) fprintf(stderr, "Out of memory\n"), exit(1);
86   data = (ALIGN ? (T*)(ptr+ALIGN-(((long)ptr)&(ALIGN-1))) : (T*)ptr);
87   assert((char*)data>=ptr && (char*)data<=ptr+ALIGN);
88 }
89 
~Array()90 template<class T, int ALIGN> Array<T, ALIGN>::~Array() {
91   free(ptr);
92 }
93 
94 ///////////////////////////// Encoder ///////////////////////////
95 
96 typedef enum {COMPRESS, DECOMPRESS} Mode;
97 class Encoder {
98   Mode mode;
99   FILE *archive;
100 public:
Encoder(Mode m,FILE * f)101   Encoder(Mode m, FILE* f): mode(m), archive(f) {}
getMode() const102   Mode getMode() const {return mode;}
size() const103   long size() const {return ftell(archive);}  // length of archive so far
flush()104   void flush() {}  // call this when compression is finished
105 
106   // Compress one byte
compress(int c)107   void compress(int c) {
108     assert(mode==COMPRESS);
109     putc(c, archive);
110   }
111 
112   // Decompress and return one byte
decompress()113   int decompress() {
114     assert(mode==DECOMPRESS);
115     return getc(archive);
116   }
117 };
118 
119 ///////////////////////////////// BWT //////////////////////////////
120 
121 // Globals
122 bool fast=false;  // transform method: fast uses 5x blocksize memory, slow uses 5x/4
123 int blockSize=0x400000;  // max BWT block size
124 int n=0;          // number of elements in block, 0 < n <= blockSize
125 Array<U8> block;  // [n] text to transform
126 Array<int> ptr;   // [n] or [n/16] indexes into block to sort
127 const int PAD=72; // extra bytes in block (copy of beginning)
128 int pos=0;        // bytes compressed/decompressed so far
129 bool quiet=false; // q option?
130 
131 // true if block[a+1...] < block[b+1...] wrapping at n
lessthan(int a,int b)132 inline bool lessthan(int a, int b) {
133   if (a<0) return false;
134   if (b<0) return true;
135   int r=block[a+1]-block[b+1];  // an optimization
136   if (r) return r<0;
137   r=memcmp(&block[a+2], &block[b+2], PAD-8);
138   if (r) return r<0;
139   if (a<b) {
140     int r=memcmp(&block[a+1], &block[b+1], n-b-1);
141     if (r) return r<0;
142     r=memcmp(&block[a+n-b], &block[0], b-a);
143     if (r) return r<0;
144     return memcmp(&block[0], &block[b-a], a)<0;
145   }
146   else {
147     int r=memcmp(&block[a+1], &block[b+1], n-a-1);
148     if (r) return r<0;
149     r=memcmp(&block[0], &block[b+n-a], a-b);
150     if (r) return r<0;
151     return memcmp(&block[a-b], &block[0], b)<0;
152   }
153 
154 }
155 
156 // read 4 byte value LSB first, or -1 at EOF
read4(FILE * f)157 int read4(FILE* f) {
158   unsigned int r=getc(f);
159   r|=getc(f)<<8;
160   r|=getc(f)<<16;
161   r|=getc(f)<<24;
162   return r;
163 }
164 
165 // read n<=blockSize bytes from in to block, BWT, write to out
encodeBlock(FILE * in,Encoder & en)166 int encodeBlock(FILE* in, Encoder& en) {
167   n=fread(&block[0], 1, blockSize, in);  // n = actual block size
168   if (n<1) return 0;
169   assert(block.size()>=n+PAD);
170   for (int i=0; i<PAD; ++i) block[i+n]=block[i];
171 
172   // fast mode: sort the pointers to the block
173   if (fast) {
174     if (!quiet) printf("sorting     %10d to %10d  \r", pos, pos+n);
175     assert(ptr.size()>=n);
176     for (int i=0; i<n; ++i) ptr[i]=i;
177     stable_sort(&ptr[0], &ptr[n], lessthan);  // faster than sort() or qsort()
178     int p=min_element(&ptr[0], &ptr[n])-&ptr[0];
179     en.compress(n>>24);
180     en.compress(n>>16);
181     en.compress(n>>8);
182     en.compress(n);
183     en.compress(p>>24);
184     en.compress(p>>16);
185     en.compress(p>>8);
186     en.compress(p);
187     if (!quiet) printf("compressing %10d to %10d  \r", pos, pos+n);
188     for (int i=0; i<n; ++i) {
189       en.compress(block[ptr[i]]);
190       if (!quiet && i && (i&0xffff)==0)
191         printf("compressed  %10d of %10d  \r", pos+i, pos+n);
192     }
193     pos+=n;
194     return n;
195   }
196 
197   // slow mode: divide the block into 16 parts, sort them, write the pointers
198   // to temporary files, then merge them.
199   else {
200 
201     // write header
202     if (!quiet) printf("writing header at %10d          \r", pos);
203     int p=0;
204     for (int i=1; i<n; ++i)
205       if (lessthan(i, 0)) ++p;
206     en.compress(n>>24);
207     en.compress(n>>16);
208     en.compress(n>>8);
209     en.compress(n);
210     en.compress(p>>24);
211     en.compress(p>>16);
212     en.compress(p>>8);
213     en.compress(p);
214 
215     // sort pointers in 16 parts to temporary files
216     const int subBlockSize = (n-1)/16+1;  // max size of sub-block
217     int start=0, end=subBlockSize;  // range of current sub-block
218     FILE* tmp[16];  // temporary files
219     for (int i=0; i<16; ++i) {
220       if (!quiet) printf("sorting      %10d to %10d  \r", pos+start, pos+end);
221       tmp[i]=tmpfile();
222       if (!tmp[i]) perror("tmpfile()"), exit(1);
223       for (int j=start; j<end; ++j) ptr[j-start]=j;
224       stable_sort(&ptr[0], &ptr[end-start], lessthan);
225       for (int j=start; j<end; ++j) {  // write pointers
226         int c=ptr[j-start];
227         fprintf(tmp[i], "%c%c%c%c", c, c>>8, c>>16, c>>24);
228       }
229       start=end;
230       end+=subBlockSize;
231       if (end>n) end=n;
232     }
233 
234     // merge sorted pointers
235     if (!quiet) printf("merging      %10d to %10d  \r", pos, pos+n);
236     unsigned int t[16];  // current pointers
237     for (int i=0; i<16; ++i) {  // init t
238       rewind(tmp[i]);
239       t[i]=read4(tmp[i]);
240     }
241     for (int i=0; i<n; ++i) {  // merge and compress
242       int j=min_element(t, t+16, lessthan)-t;
243       en.compress(block[t[j]]);
244       if (!quiet && i && (i&0xffff)==0)
245         printf("compressed  %10d of %10d  \r", pos+i, pos+n);
246       t[j]=read4(tmp[j]);
247     }
248     for (int i=0; i<16; ++i)  // delete tmp files
249       fclose(tmp[i]);
250     pos+=n;
251     return n;
252   }
253 }
254 
255 // forward BWT
encode(FILE * in,Encoder & en)256 void encode(FILE* in, Encoder& en) {
257   block.resize(blockSize+PAD);
258   if (fast) ptr.resize(blockSize+1);
259   else ptr.resize((blockSize-1)/16+2);
260   while (encodeBlock(in, en));
261 /*  en.compress(0);  // mark EOF
262   en.compress(0);
263   en.compress(0);
264   en.compress(0);*/
265 }
266 
267 // inverse BWT of one block
decodeBlock(Encoder & en,FILE * out)268 int decodeBlock(Encoder& en, FILE* out) {
269 
270   // read block size
271   int n=en.decompress();
272   n=n*256+en.decompress();
273   n=n*256+en.decompress();
274   n=n*256+en.decompress();
275   if (n==0) return n;
276   if (!blockSize) {  // first block?  allocate memory
277     blockSize = n;
278     if (!quiet) printf("block size = %d\n", blockSize);
279     block.resize(blockSize+PAD);
280     if (fast) ptr.resize(blockSize);
281     else ptr.resize(blockSize/16+256);
282   }
283   else if (n<1 || n>blockSize) {
284     printf("file corrupted: block=%d max=%d\n", n, blockSize);
285     exit(1);
286   }
287 
288   // read pointer to first byte
289   int p=en.decompress();
290   p=p*256+en.decompress();
291   p=p*256+en.decompress();
292   p=p*256+en.decompress();
293   if (p<0 || p>=n) {
294     printf("file corrupted: p=%d n=%d\n", p, n);
295     exit(1);
296   }
297 
298   // decompress and read block
299   for (int i=0; i<n; ++i) {
300     block[i]=en.decompress();
301     if (!quiet && i && (i&0xffff)==0)
302       printf("decompressed %10d of %10d  \r", pos+i, pos+n);
303   }
304   for (int i=0; i<PAD; ++i) block[i+n]=block[i];  // circular pad
305 
306   // count (sort) bytes
307   if (!quiet) printf("unsorting    %10d to %10d  \r", pos, pos+n);
308   Array<int> t(257);  // i -> number of bytes < i in block
309   for (int i=0; i<n; ++i)
310     ++t[block[i]+1];
311   for (int i=1; i<257; ++i)
312     t[i]+=t[i-1];
313   assert(t[256]==n);
314 
315   // fast mode: build linked list
316   if (fast) {
317     for (int i=0; i<n; ++i)
318       ptr[t[block[i]]++]=i;
319     assert(t[255]==n);
320 
321     // traverse list
322     for (int i=0; i<n; ++i) {
323       assert(p>=0 && p<n);
324       putc(block[p], out);
325       p=ptr[p];
326     }
327     return n;
328   }
329 
330   // slow: build ptr[t[c]+c+i] = position of i*16'th occurrence of c in block
331   Array<int> count(256);  // i -> count of i in block
332   for (int i=0; i<n; ++i) {
333     int c=block[i];
334     if ((count[c]++ & 15)==0)
335       ptr[(t[c]>>4)+c+(count[c]>>4)]=i;
336   }
337 
338   // decode
339   int c=block[p];
340   for (int i=0; i<n; ++i) {
341     assert(p>=0 && p<n);
342     putc(c, out);
343 
344     // find next c by binary search in t so that t[c] <= p < t[c+1]
345     c=127;
346     int d=64;
347     while (d) {
348       if (t[c]>p) c-=d;
349       else if (t[c+1]<=p) c+=d;
350       else break;
351       d>>=1;
352     }
353     if (c==254 && t[255]<=p && p<t[256]) c=255;
354     assert(c>=0 && c<256 && t[c]<=p && p<t[c+1]);
355 
356     // find approximate position of p
357     int offset=p-t[c];
358     const U8* q=&block[ptr[(t[c]>>4)+c+(offset>>4)]];  // start of linear search
359     offset&=15;
360 
361     // find next p by linear search for offset'th occurrence of c in block
362     while (offset--)
363       if (*++q != c) q=(const U8*)memchr(q, c, &block[n]-q);
364     assert(q && q>=&block[0] && q<&block[n]);
365     p=q-&block[0];
366   }
367   pos+=n;
368   return n;
369 }
370 
371 // inverse BWT of file
decode(Encoder & en,FILE * out)372 void decode(Encoder& en, FILE* out) {
373   while (decodeBlock(en, out));
374 }
375 
376 /////////////////////////////// main ////////////////////////////
377 
main(int argc,char ** argv)378 int main(int argc, char** argv) {
379   clock_t start=clock();
380 
381   // check for args
382   if (argc<4) {
383     printf("bwt Big Block BWT file encoder, ver. 1\n"
384       "(C) 2009, Matt Mahoney.  Free under GPL, http://www.gnu.org/licenses/gpl.txt\n"
385       "\n"
386       "To encode a file: bbb command input output\n"
387       "\n"
388       "Commands:\n"
389       "c = code (default),  d = decode.\n"
390       "f = fast mode, needs 5x block size memory, default uses 1.25x block size.\n"
391       "q = quiet (no output except error messages).\n"
392       "bN, kN, mN = use block size N bytes, KiB, MiB, default = m4 (compression only).\n"
393       "\n"
394       "Commands should be concatenated in any order, e.g. bwt cfm100q foo foo.bwt\n"
395       "means code foo to foo.bwt in fast mode using 100 MiB block size in quiet\n"
396       "mode.\n");
397     exit(0);
398   }
399 
400   // read options
401   Mode mode=COMPRESS;
402   const char* p=argv[1];
403   while (*p) {
404     switch (*p) {
405       case 'c': mode=COMPRESS; break;
406       case 'd': mode=DECOMPRESS; break;
407       case 'f': fast=true; break;
408       case 'b': blockSize=atoi(p+1); break;
409       case 'k': blockSize=atoi(p+1)<<10; break;
410       case 'm': blockSize=atoi(p+1)<<20; break;
411       case 'q': quiet=true; break;
412     }
413     ++p;
414   }
415   if (blockSize<1) printf("Block size must be at least 1\n"), exit(1);
416 
417   // open files
418   FILE* in=fopen(argv[2], "rb");
419   if (!in) perror(argv[2]), exit(1);
420   FILE* out=fopen(argv[3], "wb");
421   if (!out) perror(argv[3]), exit(1);
422 
423   // encode or decode
424   if (mode==COMPRESS) {
425     if (!quiet) printf("Compressing %s to %s in %s mode, block size = %d\n",
426       argv[2], argv[3], fast ? "fast" : "slow", blockSize);
427     Encoder en(COMPRESS, out);
428     encode(in, en);
429     en.flush();
430   }
431   else if (mode==DECOMPRESS) {
432     blockSize=0;
433     if (!quiet) printf("Decompressing %s to %s in %s mode\n",
434       argv[2], argv[3], fast ? "fast" : "slow");
435     Encoder en(DECOMPRESS, in);
436     decode(en, out);
437   }
438   if (!quiet) printf("%ld -> %ld in %1.2f sec                  \n", ftell(in), ftell(out),
439     (clock()-start+0.0)/CLOCKS_PER_SEC);
440   return 0;
441 }
442 
443