1 // bwt.cpp - BWT transform (does not compress)
2 // (C) 2006-2009, Matt Mahoney,
3 // This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
4
5 // Derived from BBB, without the compression part. Commands are the same.
6
7
8 #include <cstdio>
9 #include <cstdlib>
10 #include <cstring>
11 #include <ctime>
12 #include <algorithm>
13 #define NDEBUG // remove for debugging
14 #include <cassert>
15 using namespace std;
16
17 // 8, 16, 32 bit unsigned types
18 typedef unsigned char U8;
19 typedef unsigned short U16;
20 typedef unsigned int U32;
21
22 //////////////////////////// Array ////////////////////////////
23
24 // Array<T, ALIGN> a(n); creates n elements of T initialized to 0 bits.
25 // Constructors for T are not called.
26 // Indexing is bounds checked if assertions are on.
27 // a.size() returns n.
28 // a.resize(n) changes size to n, padding with 0 bits or truncating.
29 // Copy and assignment are not supported.
30 // Memory is aligned on a ALIGN byte boundary (power of 2), default is none.
31
32 template <class T, int ALIGN=0> class Array {
33 private:
34 int n; // user size
35 int reserved; // actual size
36 char *ptr; // allocated memory, zeroed
37 T* data; // start of n elements of aligned data
38 void create(int i); // create with size i
39 public:
Array(int i=0)40 explicit Array(int i=0) {create(i);}
41 ~Array();
operator [](int i)42 T& operator[](int i) {
43 #ifndef NDEBUG
44 if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), exit(1);
45 #endif
46 return data[i];
47 }
operator [](int i) const48 const T& operator[](int i) const {
49 #ifndef NDEBUG
50 if (i<0 || i>=n) fprintf(stderr, "%d out of bounds %d\n", i, n), exit(1);
51 #endif
52 return data[i];
53 }
size() const54 int size() const {return n;}
55 void resize(int i); // change size to i
56 private:
57 Array(const Array&); // no copy or assignment
58 Array& operator=(const Array&);
59 };
60
resize(int i)61 template<class T, int ALIGN> void Array<T, ALIGN>::resize(int i) {
62 if (i<=reserved) {
63 n=i;
64 return;
65 }
66 char *saveptr=ptr;
67 T *savedata=data;
68 int saven=n;
69 create(i);
70 if (savedata && saveptr) {
71 memcpy(data, savedata, sizeof(T)*min(i, saven));
72 free(saveptr);
73 }
74 }
75
create(int i)76 template<class T, int ALIGN> void Array<T, ALIGN>::create(int i) {
77 n=reserved=i;
78 if (i<=0) {
79 data=0;
80 ptr=0;
81 return;
82 }
83 const int sz=ALIGN+n*sizeof(T);
84 ptr = (char*)calloc(sz, 1);
85 if (!ptr) fprintf(stderr, "Out of memory\n"), exit(1);
86 data = (ALIGN ? (T*)(ptr+ALIGN-(((long)ptr)&(ALIGN-1))) : (T*)ptr);
87 assert((char*)data>=ptr && (char*)data<=ptr+ALIGN);
88 }
89
~Array()90 template<class T, int ALIGN> Array<T, ALIGN>::~Array() {
91 free(ptr);
92 }
93
94 ///////////////////////////// Encoder ///////////////////////////
95
96 typedef enum {COMPRESS, DECOMPRESS} Mode;
97 class Encoder {
98 Mode mode;
99 FILE *archive;
100 public:
Encoder(Mode m,FILE * f)101 Encoder(Mode m, FILE* f): mode(m), archive(f) {}
getMode() const102 Mode getMode() const {return mode;}
size() const103 long size() const {return ftell(archive);} // length of archive so far
flush()104 void flush() {} // call this when compression is finished
105
106 // Compress one byte
compress(int c)107 void compress(int c) {
108 assert(mode==COMPRESS);
109 putc(c, archive);
110 }
111
112 // Decompress and return one byte
decompress()113 int decompress() {
114 assert(mode==DECOMPRESS);
115 return getc(archive);
116 }
117 };
118
119 ///////////////////////////////// BWT //////////////////////////////
120
121 // Globals
122 bool fast=false; // transform method: fast uses 5x blocksize memory, slow uses 5x/4
123 int blockSize=0x400000; // max BWT block size
124 int n=0; // number of elements in block, 0 < n <= blockSize
125 Array<U8> block; // [n] text to transform
126 Array<int> ptr; // [n] or [n/16] indexes into block to sort
127 const int PAD=72; // extra bytes in block (copy of beginning)
128 int pos=0; // bytes compressed/decompressed so far
129 bool quiet=false; // q option?
130
131 // true if block[a+1...] < block[b+1...] wrapping at n
lessthan(int a,int b)132 inline bool lessthan(int a, int b) {
133 if (a<0) return false;
134 if (b<0) return true;
135 int r=block[a+1]-block[b+1]; // an optimization
136 if (r) return r<0;
137 r=memcmp(&block[a+2], &block[b+2], PAD-8);
138 if (r) return r<0;
139 if (a<b) {
140 int r=memcmp(&block[a+1], &block[b+1], n-b-1);
141 if (r) return r<0;
142 r=memcmp(&block[a+n-b], &block[0], b-a);
143 if (r) return r<0;
144 return memcmp(&block[0], &block[b-a], a)<0;
145 }
146 else {
147 int r=memcmp(&block[a+1], &block[b+1], n-a-1);
148 if (r) return r<0;
149 r=memcmp(&block[0], &block[b+n-a], a-b);
150 if (r) return r<0;
151 return memcmp(&block[a-b], &block[0], b)<0;
152 }
153
154 }
155
156 // read 4 byte value LSB first, or -1 at EOF
read4(FILE * f)157 int read4(FILE* f) {
158 unsigned int r=getc(f);
159 r|=getc(f)<<8;
160 r|=getc(f)<<16;
161 r|=getc(f)<<24;
162 return r;
163 }
164
165 // read n<=blockSize bytes from in to block, BWT, write to out
encodeBlock(FILE * in,Encoder & en)166 int encodeBlock(FILE* in, Encoder& en) {
167 n=fread(&block[0], 1, blockSize, in); // n = actual block size
168 if (n<1) return 0;
169 assert(block.size()>=n+PAD);
170 for (int i=0; i<PAD; ++i) block[i+n]=block[i];
171
172 // fast mode: sort the pointers to the block
173 if (fast) {
174 if (!quiet) printf("sorting %10d to %10d \r", pos, pos+n);
175 assert(ptr.size()>=n);
176 for (int i=0; i<n; ++i) ptr[i]=i;
177 stable_sort(&ptr[0], &ptr[n], lessthan); // faster than sort() or qsort()
178 int p=min_element(&ptr[0], &ptr[n])-&ptr[0];
179 en.compress(n>>24);
180 en.compress(n>>16);
181 en.compress(n>>8);
182 en.compress(n);
183 en.compress(p>>24);
184 en.compress(p>>16);
185 en.compress(p>>8);
186 en.compress(p);
187 if (!quiet) printf("compressing %10d to %10d \r", pos, pos+n);
188 for (int i=0; i<n; ++i) {
189 en.compress(block[ptr[i]]);
190 if (!quiet && i && (i&0xffff)==0)
191 printf("compressed %10d of %10d \r", pos+i, pos+n);
192 }
193 pos+=n;
194 return n;
195 }
196
197 // slow mode: divide the block into 16 parts, sort them, write the pointers
198 // to temporary files, then merge them.
199 else {
200
201 // write header
202 if (!quiet) printf("writing header at %10d \r", pos);
203 int p=0;
204 for (int i=1; i<n; ++i)
205 if (lessthan(i, 0)) ++p;
206 en.compress(n>>24);
207 en.compress(n>>16);
208 en.compress(n>>8);
209 en.compress(n);
210 en.compress(p>>24);
211 en.compress(p>>16);
212 en.compress(p>>8);
213 en.compress(p);
214
215 // sort pointers in 16 parts to temporary files
216 const int subBlockSize = (n-1)/16+1; // max size of sub-block
217 int start=0, end=subBlockSize; // range of current sub-block
218 FILE* tmp[16]; // temporary files
219 for (int i=0; i<16; ++i) {
220 if (!quiet) printf("sorting %10d to %10d \r", pos+start, pos+end);
221 tmp[i]=tmpfile();
222 if (!tmp[i]) perror("tmpfile()"), exit(1);
223 for (int j=start; j<end; ++j) ptr[j-start]=j;
224 stable_sort(&ptr[0], &ptr[end-start], lessthan);
225 for (int j=start; j<end; ++j) { // write pointers
226 int c=ptr[j-start];
227 fprintf(tmp[i], "%c%c%c%c", c, c>>8, c>>16, c>>24);
228 }
229 start=end;
230 end+=subBlockSize;
231 if (end>n) end=n;
232 }
233
234 // merge sorted pointers
235 if (!quiet) printf("merging %10d to %10d \r", pos, pos+n);
236 unsigned int t[16]; // current pointers
237 for (int i=0; i<16; ++i) { // init t
238 rewind(tmp[i]);
239 t[i]=read4(tmp[i]);
240 }
241 for (int i=0; i<n; ++i) { // merge and compress
242 int j=min_element(t, t+16, lessthan)-t;
243 en.compress(block[t[j]]);
244 if (!quiet && i && (i&0xffff)==0)
245 printf("compressed %10d of %10d \r", pos+i, pos+n);
246 t[j]=read4(tmp[j]);
247 }
248 for (int i=0; i<16; ++i) // delete tmp files
249 fclose(tmp[i]);
250 pos+=n;
251 return n;
252 }
253 }
254
255 // forward BWT
encode(FILE * in,Encoder & en)256 void encode(FILE* in, Encoder& en) {
257 block.resize(blockSize+PAD);
258 if (fast) ptr.resize(blockSize+1);
259 else ptr.resize((blockSize-1)/16+2);
260 while (encodeBlock(in, en));
261 /* en.compress(0); // mark EOF
262 en.compress(0);
263 en.compress(0);
264 en.compress(0);*/
265 }
266
267 // inverse BWT of one block
decodeBlock(Encoder & en,FILE * out)268 int decodeBlock(Encoder& en, FILE* out) {
269
270 // read block size
271 int n=en.decompress();
272 n=n*256+en.decompress();
273 n=n*256+en.decompress();
274 n=n*256+en.decompress();
275 if (n==0) return n;
276 if (!blockSize) { // first block? allocate memory
277 blockSize = n;
278 if (!quiet) printf("block size = %d\n", blockSize);
279 block.resize(blockSize+PAD);
280 if (fast) ptr.resize(blockSize);
281 else ptr.resize(blockSize/16+256);
282 }
283 else if (n<1 || n>blockSize) {
284 printf("file corrupted: block=%d max=%d\n", n, blockSize);
285 exit(1);
286 }
287
288 // read pointer to first byte
289 int p=en.decompress();
290 p=p*256+en.decompress();
291 p=p*256+en.decompress();
292 p=p*256+en.decompress();
293 if (p<0 || p>=n) {
294 printf("file corrupted: p=%d n=%d\n", p, n);
295 exit(1);
296 }
297
298 // decompress and read block
299 for (int i=0; i<n; ++i) {
300 block[i]=en.decompress();
301 if (!quiet && i && (i&0xffff)==0)
302 printf("decompressed %10d of %10d \r", pos+i, pos+n);
303 }
304 for (int i=0; i<PAD; ++i) block[i+n]=block[i]; // circular pad
305
306 // count (sort) bytes
307 if (!quiet) printf("unsorting %10d to %10d \r", pos, pos+n);
308 Array<int> t(257); // i -> number of bytes < i in block
309 for (int i=0; i<n; ++i)
310 ++t[block[i]+1];
311 for (int i=1; i<257; ++i)
312 t[i]+=t[i-1];
313 assert(t[256]==n);
314
315 // fast mode: build linked list
316 if (fast) {
317 for (int i=0; i<n; ++i)
318 ptr[t[block[i]]++]=i;
319 assert(t[255]==n);
320
321 // traverse list
322 for (int i=0; i<n; ++i) {
323 assert(p>=0 && p<n);
324 putc(block[p], out);
325 p=ptr[p];
326 }
327 return n;
328 }
329
330 // slow: build ptr[t[c]+c+i] = position of i*16'th occurrence of c in block
331 Array<int> count(256); // i -> count of i in block
332 for (int i=0; i<n; ++i) {
333 int c=block[i];
334 if ((count[c]++ & 15)==0)
335 ptr[(t[c]>>4)+c+(count[c]>>4)]=i;
336 }
337
338 // decode
339 int c=block[p];
340 for (int i=0; i<n; ++i) {
341 assert(p>=0 && p<n);
342 putc(c, out);
343
344 // find next c by binary search in t so that t[c] <= p < t[c+1]
345 c=127;
346 int d=64;
347 while (d) {
348 if (t[c]>p) c-=d;
349 else if (t[c+1]<=p) c+=d;
350 else break;
351 d>>=1;
352 }
353 if (c==254 && t[255]<=p && p<t[256]) c=255;
354 assert(c>=0 && c<256 && t[c]<=p && p<t[c+1]);
355
356 // find approximate position of p
357 int offset=p-t[c];
358 const U8* q=&block[ptr[(t[c]>>4)+c+(offset>>4)]]; // start of linear search
359 offset&=15;
360
361 // find next p by linear search for offset'th occurrence of c in block
362 while (offset--)
363 if (*++q != c) q=(const U8*)memchr(q, c, &block[n]-q);
364 assert(q && q>=&block[0] && q<&block[n]);
365 p=q-&block[0];
366 }
367 pos+=n;
368 return n;
369 }
370
371 // inverse BWT of file
decode(Encoder & en,FILE * out)372 void decode(Encoder& en, FILE* out) {
373 while (decodeBlock(en, out));
374 }
375
376 /////////////////////////////// main ////////////////////////////
377
main(int argc,char ** argv)378 int main(int argc, char** argv) {
379 clock_t start=clock();
380
381 // check for args
382 if (argc<4) {
383 printf("bwt Big Block BWT file encoder, ver. 1\n"
384 "(C) 2009, Matt Mahoney. Free under GPL, http://www.gnu.org/licenses/gpl.txt\n"
385 "\n"
386 "To encode a file: bbb command input output\n"
387 "\n"
388 "Commands:\n"
389 "c = code (default), d = decode.\n"
390 "f = fast mode, needs 5x block size memory, default uses 1.25x block size.\n"
391 "q = quiet (no output except error messages).\n"
392 "bN, kN, mN = use block size N bytes, KiB, MiB, default = m4 (compression only).\n"
393 "\n"
394 "Commands should be concatenated in any order, e.g. bwt cfm100q foo foo.bwt\n"
395 "means code foo to foo.bwt in fast mode using 100 MiB block size in quiet\n"
396 "mode.\n");
397 exit(0);
398 }
399
400 // read options
401 Mode mode=COMPRESS;
402 const char* p=argv[1];
403 while (*p) {
404 switch (*p) {
405 case 'c': mode=COMPRESS; break;
406 case 'd': mode=DECOMPRESS; break;
407 case 'f': fast=true; break;
408 case 'b': blockSize=atoi(p+1); break;
409 case 'k': blockSize=atoi(p+1)<<10; break;
410 case 'm': blockSize=atoi(p+1)<<20; break;
411 case 'q': quiet=true; break;
412 }
413 ++p;
414 }
415 if (blockSize<1) printf("Block size must be at least 1\n"), exit(1);
416
417 // open files
418 FILE* in=fopen(argv[2], "rb");
419 if (!in) perror(argv[2]), exit(1);
420 FILE* out=fopen(argv[3], "wb");
421 if (!out) perror(argv[3]), exit(1);
422
423 // encode or decode
424 if (mode==COMPRESS) {
425 if (!quiet) printf("Compressing %s to %s in %s mode, block size = %d\n",
426 argv[2], argv[3], fast ? "fast" : "slow", blockSize);
427 Encoder en(COMPRESS, out);
428 encode(in, en);
429 en.flush();
430 }
431 else if (mode==DECOMPRESS) {
432 blockSize=0;
433 if (!quiet) printf("Decompressing %s to %s in %s mode\n",
434 argv[2], argv[3], fast ? "fast" : "slow");
435 Encoder en(DECOMPRESS, in);
436 decode(en, out);
437 }
438 if (!quiet) printf("%ld -> %ld in %1.2f sec \n", ftell(in), ftell(out),
439 (clock()-start+0.0)/CLOCKS_PER_SEC);
440 return 0;
441 }
442
443