1 // zpaq.cpp - Journaling incremental deduplicating archiver
2
3 #define ZPAQ_VERSION "6.57"
4
5 /* Copyright (C) 2009-2014, Dell Inc. Written by Matt Mahoney.
6
7 LICENSE
8
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License as
11 published by the Free Software Foundation; either version 3 of
12 the License, or (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details at
<http://www.gnu.org/copyleft/gpl.html>.
19
20 divsufsort.c from libdivsufsort-lite is (C) 2003-2008, Yuta Mori
21 and is embedded in this file. It is licensed under the MIT license
22 described below.
23
24 zpaq is a journaling (append-only) archiver for incremental backups.
25 Files are added only when the last-modified date has changed. Both the old
26 and new versions are saved. You can extract from old versions of the
27 archive by specifying a date or version number. zpaq supports 5
28 compression levels, deduplication, AES-256 encryption, and multi-threading
29 using an open, self-describing format for backward and forward
30 compatibility in Windows and Linux. See zpaq.pod for usage.
31
32 TO COMPILE:
33
34 This program needs libzpaq from http://mattmahoney.net/zpaq/ and
35 libdivsufsort-lite from above or http://code.google.com/p/libdivsufsort/
36 Recommended compile for Windows with MinGW:
37
38 g++ -O3 zpaq.cpp libzpaq.cpp -o zpaq
39
40 With Visual C++:
41
42 cl /O2 /EHsc zpaq.cpp libzpaq.cpp advapi32.lib
43
44 For Linux:
45
46 g++ -O3 -Dunix zpaq.cpp libzpaq.cpp -pthread -o zpaq
47
48 Possible options:
49
50 -o Name of output executable.
51 -O3 or /O2 Optimize (faster).
/EHsc       Enable exception handling in VC++ (required).
53 -s Strip debugging symbols. Smaller executable.
54 /arch:SSE2 Assume x86 processor with SSE2. Otherwise use -DNOJIT.
55 -msse2 Same. Implied by -m64 for a x86-64 target.
56 -DNOJIT Don't assume x86 with SSE2 for libzpaq. Slower (disables JIT).
57 -static Don't assume C++ runtime on target. Bigger executable but safer.
58 -Dunix Not Windows. Sometimes automatic in Linux. Needed for Mac OS/X.
59 -fopenmp Parallel divsufsort (faster, implies -pthread, broken in MinGW).
60 -pthread Required in Linux, implied by -fopenmp.
61 -DDEBUG Turn on debugging checks.
62 -DPTHREAD Use Pthreads instead of Windows threads. Requires pthreadGC2.dll
63 or pthreadVC2.dll from http://sourceware.org/pthreads-win32/
64 -Dunixtest To make -Dunix work in Windows with MinGW.
65 -Wl,--large-address-aware To make 3 GB available in 32 bit Windows.
66
67 */
68 #define _FILE_OFFSET_BITS 64 // In Linux make sizeof(off_t) == 8
69 #define UNICODE // For Windows
70 #include "libzpaq.h"
71 #include <stdio.h>
72 #include <stdlib.h>
73 #include <string.h>
74 #include <ctype.h>
75 #include <time.h>
76 #include <stdint.h>
77 #include <string>
78 #include <vector>
79 #include <map>
80 #include <algorithm>
81 #include <stdexcept>
82 #include <fcntl.h>
83
84 #ifndef DEBUG
85 #define NDEBUG 1
86 #endif
87 #include <assert.h>
88
89 #ifdef unix
90 #define PTHREAD 1
91 #include <sys/types.h>
92 #include <sys/stat.h>
93 #include <sys/time.h>
94 #include <unistd.h>
95 #include <dirent.h>
96 #include <utime.h>
97 #include <errno.h>
98
99 #ifdef unixtest
// Stand-ins for the <termios.h> names used by this file, compiled only
// with -Dunixtest. Both functions do nothing and report success.
struct termios {
  int c_lflag;  // the only field this stub provides
};
#define ECHO 1
#define ECHONL 2
#define TCSANOW 4

int tcgetattr(int /*fd*/, termios* /*attr*/) {
  return 0;
}

int tcsetattr(int /*fd*/, int /*action*/, termios* /*attr*/) {
  return 0;
}
108 #else
109 #include <termios.h>
110 #endif
111
112 #else // Assume Windows
113 #include <windows.h>
114 #include <wincrypt.h>
115 #include <io.h>
116 #endif
117
118 using std::string;
119 using std::vector;
120 using std::map;
121 using std::min;
122 using std::max;
123
124 // Handle errors in libzpaq and elsewhere
error(const char * msg)125 void libzpaq::error(const char* msg) {
126 fprintf(stderr, "zpaq error: %s\n", msg);
127 if (strstr(msg, "ut of memory")) throw std::bad_alloc();
128 throw std::runtime_error(msg);
129 }
130 using libzpaq::error;
131
132 // Portable thread types and functions for Windows and Linux. Use like this:
133 //
134 // // Create mutex for locking thread-unsafe code
135 // Mutex mutex; // shared by all threads
136 // init_mutex(mutex); // initialize in unlocked state
//   Semaphore sem; sem.init(n);  // n >= 0 is initial count
138 //
139 // // Declare a thread function
140 // ThreadReturn thread(void *arg) { // arg points to in/out parameters
141 // lock(mutex); // wait if another thread has it first
142 // release(mutex); // allow another waiting thread to continue
143 // sem.wait(); // wait until n>0, then --n
144 // sem.signal(); // ++n to allow waiting threads to continue
145 // return 0; // must return 0 to exit thread
146 // }
147 //
148 // // Start a thread
149 // ThreadID tid;
150 // run(tid, thread, &arg); // runs in parallel
151 // join(tid); // wait for thread to return
152 // destroy_mutex(mutex); // deallocate resources used by mutex
153 // sem.destroy(); // deallocate resources used by semaphore
154
155 #ifdef PTHREAD
156 #include <pthread.h>
157 typedef void* ThreadReturn; // job return type
158 typedef pthread_t ThreadID; // job ID type
run(ThreadID & tid,ThreadReturn (* f)(void *),void * arg)159 void run(ThreadID& tid, ThreadReturn(*f)(void*), void* arg)// start job
160 {pthread_create(&tid, NULL, f, arg);}
join(ThreadID tid)161 void join(ThreadID tid) {pthread_join(tid, NULL);} // wait for job
typedef pthread_mutex_t Mutex;  // mutex type

// Initialize m with default attributes.
void init_mutex(Mutex& m) {
  pthread_mutex_init(&m, 0);
}

// Acquire m, waiting if another thread holds it.
void lock(Mutex& m) {
  pthread_mutex_lock(&m);
}

// Release m.
void release(Mutex& m) {
  pthread_mutex_unlock(&m);
}

// Free the resources used by m.
void destroy_mutex(Mutex& m) {
  pthread_mutex_destroy(&m);
}
167
// Counting semaphore built from a pthread mutex and condition variable.
// Usage: init(n) once, then wait()/signal() from any thread, then destroy().
class Semaphore {
public:
  // Construct in the uninitialized state (sem == -1); call init() before use.
  Semaphore() {sem=-1;}

  // Initialize with count n >= 0. Must not already be initialized.
  void init(int n) {
    assert(n>=0);
    assert(sem==-1);
    pthread_cond_init(&cv, 0);
    pthread_mutex_init(&mutex, 0);
    sem=n;
  }

  // Release the mutex and condition variable.
  void destroy() {
    assert(sem>=0);
    pthread_mutex_destroy(&mutex);
    pthread_cond_destroy(&cv);
  }

  // Wait until the count is positive, then decrement it.
  // Returns 0 on success, or the error code from pthread_cond_wait(), in
  // which case the count is left unchanged.
  int wait() {
    assert(sem>=0);
    pthread_mutex_lock(&mutex);
    int r=0;
    // Re-test the count after every wakeup: the wakeup may be spurious, or
    // another waiter may already have consumed the signalled unit.
    while (sem==0 && r==0)
      r=pthread_cond_wait(&cv, &mutex);
    if (r==0) {
      assert(sem>0);
      --sem;
    }
    pthread_mutex_unlock(&mutex);
    return r;
  }

  // Increment the count and wake one waiting thread, if any.
  void signal() {
    assert(sem>=0);
    pthread_mutex_lock(&mutex);
    ++sem;
    pthread_cond_signal(&cv);
    pthread_mutex_unlock(&mutex);
  }
private:
  pthread_cond_t cv;      // to signal FINISHED
  pthread_mutex_t mutex;  // protects cv and sem
  int sem;                // semaphore count, or -1 before init()
};
205
206 #else // Windows
207 typedef DWORD ThreadReturn;
208 typedef HANDLE ThreadID;
run(ThreadID & tid,ThreadReturn (* f)(void *),void * arg)209 void run(ThreadID& tid, ThreadReturn(*f)(void*), void* arg) {
210 tid=CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)f, arg, 0, NULL);
211 if (tid==NULL) error("CreateThread failed");
212 }
join(ThreadID & tid)213 void join(ThreadID& tid) {WaitForSingleObject(tid, INFINITE);}
214 typedef HANDLE Mutex;
init_mutex(Mutex & m)215 void init_mutex(Mutex& m) {m=CreateMutex(NULL, FALSE, NULL);}
lock(Mutex & m)216 void lock(Mutex& m) {WaitForSingleObject(m, INFINITE);}
release(Mutex & m)217 void release(Mutex& m) {ReleaseMutex(m);}
destroy_mutex(Mutex & m)218 void destroy_mutex(Mutex& m) {CloseHandle(m);}
219
220 class Semaphore {
221 public:
222 enum {MAXCOUNT=2000000000};
Semaphore()223 Semaphore(): h(NULL) {}
init(int n)224 void init(int n) {assert(!h); h=CreateSemaphore(NULL, n, MAXCOUNT, NULL);}
destroy()225 void destroy() {assert(h); CloseHandle(h);}
wait()226 int wait() {assert(h); return WaitForSingleObject(h, INFINITE);}
signal()227 void signal() {assert(h); ReleaseSemaphore(h, 1, NULL);}
228 private:
229 HANDLE h; // Windows semaphore
230 };
231
232 #endif
233
// Map fseeko()/ftello() to the toolchain's 64-bit equivalents.
#if defined(_MSC_VER)  // Microsoft C++
#define fseeko(a,b,c) _fseeki64(a,b,c)
#define ftello(a) _ftelli64(a)
#elif !defined(unix)   // non-unix build with another compiler
#if !defined(fseeko)
#define fseeko(a,b,c) fseeko64(a,b,c)
#endif
#if !defined(ftello)
#define ftello(a) ftello64(a)
#endif
#endif

// For testing -Dunix in Windows
#if defined(unixtest)
#define lstat(a,b) stat(a,b)
#define mkdir(a,b) mkdir(a)
#if !defined(fseeko)
#define fseeko(a,b,c) fseeko64(a,b,c)
#endif
#if !defined(ftello)
#define ftello(a) ftello64(a)
#endif
#endif
259
260 // Global variables
FILE* con = stdout;        // where log output goes; can be stderr instead
bool fragile = false;      // set by the -fragile option
int64_t quiet = -1;        // set by the -quiet option
static const int64_t MAX_QUIET = 0x7FFFFFFFFFFFFFFFLL;  // no output but errors
int64_t global_start = 0;  // mtime() taken at the start of main()
266
// Number of elements of a string or vector as a signed int, so it can be
// compared with int loop indexes without sign warnings.
template <typename T>
int size(const T& x) {
  return static_cast<int>(x.size());
}
271
272 // In Windows, convert 16-bit wide string to UTF-8 and \ to /
273 #ifndef unix
string wtou(const wchar_t* s) {
  assert(sizeof(wchar_t)==2);  // Not true in Linux
  assert((wchar_t)(-1)==65535);
  string utf8;
  if (s==0) return utf8;  // NULL converts to the empty string
  while (*s) {
    const wchar_t c=*s++;
    if (c=='\\')         // path separator
      utf8+='/';
    else if (c<128)      // 1 byte
      utf8+=char(c);
    else if (c<2048) {   // 2 bytes
      utf8+=char(192+c/64);
      utf8+=char(128+c%64);
    }
    else {               // 3 bytes; each 16-bit unit is encoded on its own
      utf8+=char(224+c/4096);
      utf8+=char(128+c/64%64);
      utf8+=char(128+c%64);
    }
  }
  return utf8;
}
287
// In Windows, convert a UTF-8 string to a wide string. Bytes that do not
// start a valid 1, 2 or 3 byte sequence (which includes everything above
// 64K) are skipped. If doslash then "/" becomes "\". NULL gives "".
std::wstring utow(const char* ss, bool doslash=false) {
  assert(sizeof(wchar_t)==2);
  assert((wchar_t)(-1)==65535);
  std::wstring wide;
  if (ss==0) return wide;
  const unsigned char* p=reinterpret_cast<const unsigned char*>(ss);
  while (*p) {
    const int b0=p[0];
    int used=1;  // bytes consumed by this step
    if (b0=='/' && doslash)
      wide+=L'\\';
    else if (b0<128)
      wide+=wchar_t(b0);
    else if (b0>=192 && b0<224 && p[1]>=128 && p[1]<192) {
      wide+=wchar_t((b0-192)*64+p[1]-128);
      used=2;
    }
    else if (b0>=224 && b0<240 && p[1]>=128 && p[1]<192
             && p[2]>=128 && p[2]<192) {
      wide+=wchar_t((b0-224)*4096+(p[1]-128)*64+p[2]-128);
      used=3;
    }
    p+=used;
  }
  return wide;
}
307 #endif
308
309 // Print a UTF-8 string to f (stdout, stderr) so it displays properly
printUTF8(const char * s,FILE * f)310 void printUTF8(const char* s, FILE* f) {
311 assert(f);
312 assert(s);
313 #ifdef unix
314 fprintf(f, "%s", s);
315 #else
316 const HANDLE h=(HANDLE)_get_osfhandle(_fileno(f));
317 DWORD ft=GetFileType(h);
318 if (ft==FILE_TYPE_CHAR) {
319 fflush(f);
320 std::wstring w=utow(s); // Windows console: convert to UTF-16
321 DWORD n=0;
322 WriteConsole(h, w.c_str(), w.size(), &n, 0);
323 }
324 else // stdout redirected to file
325 fprintf(f, "%s", s);
326 #endif
327 }
328
329 // Return relative time in milliseconds
mtime()330 int64_t mtime() {
331 #ifdef unix
332 timeval tv;
333 gettimeofday(&tv, 0);
334 return tv.tv_sec*1000LL+tv.tv_usec/1000;
335 #else
336 int64_t t=GetTickCount();
337 if (t<global_start) t+=0x100000000LL;
338 return t;
339 #endif
340 }
341
// Convert 64 bit decimal YYYYMMDDHHMMSS to "YYYY-MM-DD HH:MM:SS"
// where -1 = unknown date, 0 = deleted.
// For date <= 0 the result is blank but keeps the same 19 character width
// as a formatted date, so callers printing columns stay aligned.
string dateToString(int64_t date) {
  static const char TEMPLATE[]="0000-00-00 00:00:00";
  static const int WIDTH=sizeof(TEMPLATE)-1;  // 19
  if (date<=0) return string(WIDTH, ' ');
  string s=TEMPLATE;
  // Positions of the 14 digits in s, from least to most significant
  static const int t[]={18,17,15,14,12,11,9,8,6,5,3,2,1,0};
  for (int i=0; i<14; ++i) {
    s[t[i]]+=int(date%10);
    date/=10;
  }
  return s;
}
351
// Convert 'u'+(N*256) to octal N or 'w'+(N*256) to hex N or "DRASHI".
// The low byte of attrib selects the format:
//   'u': the 6 low octal digits of N.
//   'w': N as a flag string ("D", "A", "S", "H", "R", "I" or "." in each
//        position) when only those flags are set, else "0x" followed by
//        4 hex digits, or 8 hex digits if N does not fit in 4.
//   otherwise: 6 spaces.
string attrToString(int64_t attrib) {
  string r(6, ' ');  // sized explicitly; every branch below indexes into it
  const int type=int(attrib&255);
  if (type=='u') {
    for (int i=0; i<6; ++i)
      r[5-i]=char('0'+((attrib>>(8+3*i))&7));  // &7 is a digit even if attrib<0
  }
  else if (type=='w') {
    attrib>>=8;
    if (attrib&~0x20b7) {  // non-standard flags set?
      // 0x10000 and above need more than 4 hex digits
      const int digits=attrib>0xffff ? 8 : 4;
      r="0x"+string(digits, '0');
      for (int i=0; i<digits; ++i)
        r[digits+1-i]="0123456789abcdef"[(attrib>>(4*i))&15];
    }
    else {
      r="......";
      if (attrib&0x10) r[0]='D';    // directory
      if (attrib&0x20) r[1]='A';    // archive
      if (attrib&0x04) r[2]='S';    // system
      if (attrib&0x02) r[3]='H';    // hidden
      if (attrib&0x01) r[4]='R';    // read only
      if (attrib&0x2000) r[5]='I';  // index
    }
  }
  return r;
}
383
// Convert seconds since 0000 1/1/1970 to 64 bit decimal YYYYMMDDHHMMSS.
// Valid from 1970 to 2099. Returns -1 if t <= 0.
int64_t decimal_time(time_t t) {
  if (t<=0) return -1;

  // Time of day
  const int ss=int(t%60);
  const int mi=int(t/60%60);
  const int hh=int(t/3600%24);

  // Day number within the current 4 year (1461 day) term since 1970
  time_t d=t/86400;
  const int term=int(d/1461);
  d%=1461;

  // Insert a Feb 29 into each non leap year of the term so that every
  // year has 366 days.
  static const int pad29[3]={59, 425, 1157};
  for (int i=0; i<3; ++i)
    if (d>=pad29[i]) ++d;
  const int year=int(term*4+d/366+1970);
  d%=366;

  // Stretch every month to 31 days: Feb gets 2 extra days, then
  // Apr, June, Sept and Nov get 1 each.
  if (d>=60) d+=2;
  static const int pad31[4]={123, 185, 278, 340};
  for (int i=0; i<4; ++i)
    if (d>=pad31[i]) ++d;
  const int month=int(d/31+1);
  const int day=int(d%31+1);

  return year*10000000000LL+month*100000000+day*1000000
         +hh*10000+mi*100+ss;
}
409
// Convert decimal date YYYYMMDDHHMMSS to time_t - inverse of decimal_time().
// Returns -1 if date <= 0. A month field of 0 is treated as January and
// month fields above 12 wrap around, so the month table is never indexed
// out of range.
time_t unix_time(int64_t date) {
  if (date<=0) return -1;
  static const int days[12]={0,31,59,90,120,151,181,212,243,273,304,334};
  const int year=int(date/10000000000LL%10000);
  int month=int(date/100000000%100)-1;  // 0 = January
  month=month<0 ? 0 : month%12;
  const int day=int(date/1000000%100);
  const int hour=int(date/10000%100);
  const int minute=int(date/100%100);
  const int sec=int(date%100);

  // Days since Jan 1 1970. Kept in 64 bits so that multiplying by 86400
  // below cannot overflow int for dates after January 2038.
  const int64_t daynum=day-1+days[month]+(year%4==0 && month>1)
                       +((year-1970)*1461+1)/4;
  return time_t(daynum*86400+hour*3600+minute*60+sec);
}
423
424 // Put n cryptographic random bytes in buf[0..n-1].
425 // The first byte will not be 'z' or '7' (start of a ZPAQ archive).
426 // For a pure random number, discard the first byte.
427
random(char * buf,int n)428 void random(char* buf, int n) {
429 #ifdef unix
430 FILE* in=fopen("/dev/urandom", "rb");
431 if (in && fread(buf, 1, n, in)==n)
432 fclose(in);
433 else {
434 perror("/dev/urandom");
435 error("key generation failed");
436 }
437 #else
438 HCRYPTPROV h;
439 if (CryptAcquireContext(&h, NULL, NULL, PROV_RSA_FULL,
440 CRYPT_VERIFYCONTEXT) && CryptGenRandom(h, n, (BYTE*)buf))
441 CryptReleaseContext(h, 0);
442 else {
443 fprintf(stderr, "CryptGenRandom: error %d\n", int(GetLastError()));
444 error("key generation failed");
445 }
446 #endif
447 if (n>=1 && (buf[0]=='z' || buf[0]=='7'))
448 buf[0]^=0x80;
449 }
450
451 /////////////////////////////// File //////////////////////////////////
452
// Convert non-negative number x to a decimal string, padded on the left
// with zeros to at least n digits. itos(0, 0) is the empty string.
string itos(int64_t x, int n=1) {
  assert(x>=0);
  assert(n>=0);
  string digits;
  while (x!=0 || n>0) {
    // Least significant digit first, each one placed in front
    digits.insert(digits.begin(), char('0'+x%10));
    x/=10;
    --n;
  }
  return digits;
}
461
462 // Replace * and ? in fn with part or digits of part
subpart(string fn,int part)463 string subpart(string fn, int part) {
464 for (int j=fn.size()-1; j>=0; --j) {
465 if (fn[j]=='?')
466 fn[j]='0'+part%10, part/=10;
467 else if (fn[j]=='*')
468 fn=fn.substr(0, j)+itos(part)+fn.substr(j+1), part=0;
469 }
470 return fn;
471 }
472
473 // Return true if a file or directory (UTF-8 without trailing /) exists.
474 // If part>0 then replace * and ? in filename with part or its digits.
exists(string filename,int part=0)475 bool exists(string filename, int part=0) {
476 if (part>0) filename=subpart(filename, part);
477 int len=filename.size();
478 if (len<1) return false;
479 if (filename[len-1]=='/') filename=filename.substr(0, len-1);
480 #ifdef unix
481 struct stat sb;
482 return !lstat(filename.c_str(), &sb);
483 #else
484 return GetFileAttributes(utow(filename.c_str(), true).c_str())
485 !=INVALID_FILE_ATTRIBUTES;
486 #endif
487 }
488
489 // Delete a file, return true if successful
delete_file(const char * filename)490 bool delete_file(const char* filename) {
491 #ifdef unix
492 return remove(filename)==0;
493 #else
494 return DeleteFile(utow(filename, true).c_str());
495 #endif
496 }
497
498 #ifndef unix
499
500 // Print error message
winError(const char * filename)501 void winError(const char* filename) {
502 int err=GetLastError();
503 printUTF8(filename, stderr);
504 if (err==ERROR_FILE_NOT_FOUND)
505 fprintf(stderr, ": file not found\n");
506 else if (err==ERROR_PATH_NOT_FOUND)
507 fprintf(stderr, ": path not found\n");
508 else if (err==ERROR_ACCESS_DENIED)
509 fprintf(stderr, ": access denied\n");
510 else if (err==ERROR_SHARING_VIOLATION)
511 fprintf(stderr, ": sharing violation\n");
512 else if (err==ERROR_BAD_PATHNAME)
513 fprintf(stderr, ": bad pathname\n");
514 else if (err==ERROR_INVALID_NAME)
515 fprintf(stderr, ": invalid name\n");
516 else
517 fprintf(stderr, ": Windows error %d\n", err);
518 }
519
520 // Set the last-modified date of an open file handle
setDate(HANDLE out,int64_t date)521 void setDate(HANDLE out, int64_t date) {
522 if (date>0) {
523 SYSTEMTIME st;
524 FILETIME ft;
525 st.wYear=date/10000000000LL%10000;
526 st.wMonth=date/100000000%100;
527 st.wDayOfWeek=0; // ignored
528 st.wDay=date/1000000%100;
529 st.wHour=date/10000%100;
530 st.wMinute=date/100%100;
531 st.wSecond=date%100;
532 st.wMilliseconds=0;
533 SystemTimeToFileTime(&st, &ft);
534 if (!SetFileTime(out, NULL, NULL, &ft))
535 fprintf(stderr, "SetFileTime error %d\n", int(GetLastError()));
536 }
537 }
538 #endif
539
540 // Create directories as needed. For example if path="/tmp/foo/bar"
541 // then create directories /, /tmp, and /tmp/foo unless they exist.
542 // Set date and attributes if not 0.
makepath(string path,int64_t date=0,int64_t attr=0)543 void makepath(string path, int64_t date=0, int64_t attr=0) {
544 for (int i=0; i<size(path); ++i) {
545 if (path[i]=='\\' || path[i]=='/') {
546 path[i]=0;
547 #ifdef unix
548 int ok=!mkdir(path.c_str(), 0777);
549 #else
550 int ok=CreateDirectory(utow(path.c_str(), true).c_str(), 0);
551 #endif
552 if (ok && quiet<=0) {
553 fprintf(con, "Created directory ");
554 printUTF8(path.c_str(), con);
555 fprintf(con, "\n");
556 }
557 path[i]='/';
558 }
559 }
560
561 // Set date and attributes
562 string filename=path;
563 if (filename!="" && filename[filename.size()-1]=='/')
564 filename=filename.substr(0, filename.size()-1); // remove trailing slash
565 #ifdef unix
566 if (date>0) {
567 struct utimbuf ub;
568 ub.actime=time(NULL);
569 ub.modtime=unix_time(date);
570 utime(filename.c_str(), &ub);
571 }
572 if ((attr&255)=='u')
573 chmod(filename.c_str(), attr>>8);
574 #else
575 for (int i=0; i<size(filename); ++i) // change to backslashes
576 if (filename[i]=='/') filename[i]='\\';
577 if (date>0) {
578 HANDLE out=CreateFile(utow(filename.c_str(), true).c_str(),
579 FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING,
580 FILE_FLAG_BACKUP_SEMANTICS, NULL);
581 if (out!=INVALID_HANDLE_VALUE) {
582 setDate(out, date);
583 CloseHandle(out);
584 }
585 else winError(filename.c_str());
586 }
587 if ((attr&255)=='w') {
588 SetFileAttributes(utow(filename.c_str(), true).c_str(), attr>>8);
589 }
590 #endif
591 }
592
593 // Base class of InputFile and OutputFile (OS independent)
594 class File {
595 protected:
596 enum {BUFSIZE=1<<16}; // buffer size
597 int ptr; // next byte to read or write in buf
598 libzpaq::Array<char> buf; // I/O buffer
599 libzpaq::AES_CTR *aes; // if not NULL then encrypt
600 int64_t eoff; // extra offset for multi-file encryption
File()601 File(): ptr(0), buf(BUFSIZE), aes(0), eoff(0) {}
602 };
603
604 // File types accepting UTF-8 filenames
605 #ifdef unix
606
607 class InputFile: public File, public libzpaq::Reader {
608 FILE* in;
609 int n; // number of bytes in buf
610 public:
InputFile()611 InputFile(): in(0), n(0) {}
612
613 // Open file for reading. Return true if successful.
614 // If aes then encrypt with aes+eoff.
open(const char * filename,libzpaq::AES_CTR * a=0,int64_t e=0)615 bool open(const char* filename, libzpaq::AES_CTR* a=0, int64_t e=0) {
616 in=fopen(filename, "rb");
617 if (!in) perror(filename);
618 aes=a;
619 eoff=e;
620 n=ptr=0;
621 return in!=0;
622 }
623
624 // True if open
isopen()625 bool isopen() {return in!=0;}
626
627 // Read and return 1 byte (0..255) or EOF
get()628 int get() {
629 assert(in);
630 if (ptr>=n) {
631 assert(ptr==n);
632 n=fread(&buf[0], 1, BUFSIZE, in);
633 ptr=0;
634 if (aes) {
635 int64_t off=tell()+eoff;
636 if (off<32) error("attempt to read salt");
637 aes->encrypt(&buf[0], n, off);
638 }
639 if (!n) return EOF;
640 }
641 assert(ptr<n);
642 return buf[ptr++]&255;
643 }
644
645 // Return file position
tell()646 int64_t tell() {
647 return ftello(in)-n+ptr;
648 }
649
650 // Set file position
seek(int64_t pos,int whence)651 void seek(int64_t pos, int whence) {
652 if (whence==SEEK_CUR) {
653 whence=SEEK_SET;
654 pos+=tell();
655 }
656 fseeko(in, pos, whence);
657 n=ptr=0;
658 }
659
660 // Close file if open
close()661 void close() {if (in) fclose(in), in=0;}
~InputFile()662 ~InputFile() {close();}
663 };
664
665 class OutputFile: public File, public libzpaq::Writer {
666 FILE* out;
667 string filename;
668 public:
OutputFile()669 OutputFile(): out(0) {}
670
671 // Return true if file is open
isopen()672 bool isopen() {return out!=0;}
673
674 // Open for append/update or create if needed.
675 // If aes then encrypt with aes+eoff.
open(const char * filename,libzpaq::AES_CTR * a=0,int64_t e=0)676 bool open(const char* filename, libzpaq::AES_CTR* a=0, int64_t e=0) {
677 assert(!isopen());
678 ptr=0;
679 this->filename=filename;
680 out=fopen(filename, "rb+");
681 if (!out) out=fopen(filename, "wb+");
682 if (!out) perror(filename);
683 aes=a;
684 eoff=e;
685 if (out) fseeko(out, 0, SEEK_END);
686 return isopen();
687 }
688
689 // Flush pending output
flush()690 void flush() {
691 if (ptr) {
692 assert(isopen());
693 assert(ptr>0 && ptr<=BUFSIZE);
694 if (aes) {
695 int64_t off=ftello(out)+eoff;
696 if (off<32) error("attempt to overwrite salt");
697 aes->encrypt(&buf[0], ptr, off);
698 }
699 int n=fwrite(&buf[0], 1, ptr, out);
700 if (n!=ptr) {
701 perror(filename.c_str());
702 error("write failed");
703 }
704 ptr=0;
705 }
706 }
707
708 // Write 1 byte
put(int c)709 void put(int c) {
710 assert(isopen());
711 if (ptr>=BUFSIZE) {
712 assert(ptr==BUFSIZE);
713 flush();
714 }
715 assert(ptr>=0 && ptr<BUFSIZE);
716 buf[ptr++]=c;
717 }
718
719 // Write bufp[0..size-1]
720 void write(const char* bufp, int size);
721
722 // Write size bytes at offset
write(const char * bufp,int64_t pos,int size)723 void write(const char* bufp, int64_t pos, int size) {
724 assert(isopen());
725 flush();
726 fseeko(out, pos, SEEK_SET);
727 write(bufp, size);
728 }
729
730 // Seek to pos. whence is SEEK_SET, SEEK_CUR, or SEEK_END
seek(int64_t pos,int whence)731 void seek(int64_t pos, int whence) {
732 assert(isopen());
733 flush();
734 fseeko(out, pos, whence);
735 }
736
737 // return position
tell()738 int64_t tell() {
739 assert(isopen());
740 return ftello(out)+ptr;
741 }
742
743 // Truncate file and move file pointer to end
truncate(int64_t newsize=0)744 void truncate(int64_t newsize=0) {
745 assert(isopen());
746 seek(newsize, SEEK_SET);
747 if (ftruncate(fileno(out), newsize)) perror("ftruncate");
748 }
749
750 // Close file and set date if not 0. Set permissions if attr low byte is 'u'
close(int64_t date=0,int64_t attr=0)751 void close(int64_t date=0, int64_t attr=0) {
752 if (out) {
753 flush();
754 fclose(out);
755 }
756 out=0;
757 if (date>0) {
758 struct utimbuf ub;
759 ub.actime=time(NULL);
760 ub.modtime=unix_time(date);
761 utime(filename.c_str(), &ub);
762 }
763 if ((attr&255)=='u')
764 chmod(filename.c_str(), attr>>8);
765 }
766
~OutputFile()767 ~OutputFile() {close();}
768 };
769
770 #else // Windows
771
772 class InputFile: public File, public libzpaq::Reader {
773 HANDLE in; // input file handle
774 DWORD n; // buffer size
775 public:
InputFile()776 InputFile():
777 in(INVALID_HANDLE_VALUE), n(0) {}
778
779 // Open for reading. Return true if successful.
780 // Encrypt with aes+e if aes.
open(const char * filename,libzpaq::AES_CTR * a=0,int64_t e=0)781 bool open(const char* filename, libzpaq::AES_CTR* a=0, int64_t e=0) {
782 assert(in==INVALID_HANDLE_VALUE);
783 n=ptr=0;
784 std::wstring w=utow(filename, true);
785 in=CreateFile(w.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL,
786 OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
787 if (in==INVALID_HANDLE_VALUE) winError(filename);
788 aes=a;
789 eoff=e;
790 return in!=INVALID_HANDLE_VALUE;
791 }
792
isopen()793 bool isopen() {return in!=INVALID_HANDLE_VALUE;}
794
795 // Read 1 byte
get()796 int get() {
797 if (ptr>=int(n)) {
798 assert(ptr==int(n));
799 ptr=0;
800 ReadFile(in, &buf[0], BUFSIZE, &n, NULL);
801 if (n==0) return EOF;
802 if (aes) {
803 int64_t off=tell()+eoff;
804 if (off<32) error("attempt to read salt");
805 aes->encrypt(&buf[0], n, off);
806 }
807 }
808 assert(ptr<int(n));
809 return buf[ptr++]&255;
810 }
811
812 // set file pointer
seek(int64_t pos,int whence)813 void seek(int64_t pos, int whence) {
814 if (whence==SEEK_SET) whence=FILE_BEGIN;
815 else if (whence==SEEK_END) whence=FILE_END;
816 else if (whence==SEEK_CUR) {
817 whence=FILE_BEGIN;
818 pos+=tell();
819 }
820 LONG offhigh=pos>>32;
821 SetFilePointer(in, pos, &offhigh, whence);
822 n=ptr=0;
823 }
824
825 // get file pointer
tell()826 int64_t tell() {
827 LONG offhigh=0;
828 DWORD r=SetFilePointer(in, 0, &offhigh, FILE_CURRENT);
829 return (int64_t(offhigh)<<32)+r+ptr-n;
830 }
831
832 // Close handle if open
close()833 void close() {
834 if (in!=INVALID_HANDLE_VALUE) {
835 CloseHandle(in);
836 in=INVALID_HANDLE_VALUE;
837 }
838 }
~InputFile()839 ~InputFile() {close();}
840 };
841
842 class OutputFile: public File, public libzpaq::Writer {
843 HANDLE out; // output file handle
844 std::wstring filename; // filename as wide string
845 public:
OutputFile()846 OutputFile(): out(INVALID_HANDLE_VALUE) {}
847
848 // Return true if file is open
isopen()849 bool isopen() {
850 return out!=INVALID_HANDLE_VALUE;
851 }
852
853 // Open file ready to update or append, create if needed.
854 // If aes then encrypt with aes+e.
open(const char * filename_,libzpaq::AES_CTR * a=0,int64_t e=0)855 bool open(const char* filename_, libzpaq::AES_CTR* a=0, int64_t e=0) {
856 assert(!isopen());
857 ptr=0;
858 filename=utow(filename_, true);
859 out=CreateFile(filename.c_str(), GENERIC_READ | GENERIC_WRITE,
860 0, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
861 if (out==INVALID_HANDLE_VALUE) winError(filename_);
862 else {
863 LONG hi=0;
864 aes=a;
865 eoff=e;
866 SetFilePointer(out, 0, &hi, FILE_END);
867 }
868 return isopen();
869 }
870
871 // Write pending output
flush()872 void flush() {
873 assert(isopen());
874 if (ptr) {
875 DWORD n=0;
876 if (aes) {
877 int64_t off=tell()-ptr+eoff;
878 if (off<32) error("attempt to overwrite salt");
879 aes->encrypt(&buf[0], ptr, off);
880 }
881 WriteFile(out, &buf[0], ptr, &n, NULL);
882 if (ptr!=int(n)) {
883 fprintf(stderr, "%s: error %d: wrote %d of %d bytes\n",
884 wtou(filename.c_str()).c_str(), int(GetLastError()),
885 int(n), ptr);
886 error("write failed");
887 }
888 ptr=0;
889 }
890 }
891
892 // Write 1 byte
put(int c)893 void put(int c) {
894 assert(isopen());
895 if (ptr>=BUFSIZE) {
896 assert(ptr==BUFSIZE);
897 flush();
898 }
899 buf[ptr++]=c;
900 }
901
902 // Write bufp[0..size-1]
903 void write(const char* bufp, int size);
904
905 // Write size bytes at offset
write(const char * bufp,int64_t pos,int size)906 void write(const char* bufp, int64_t pos, int size) {
907 assert(isopen());
908 flush();
909 if (pos!=tell()) seek(pos, SEEK_SET);
910 write(bufp, size);
911 }
912
913 // set file pointer
seek(int64_t pos,int whence)914 void seek(int64_t pos, int whence) {
915 if (whence==SEEK_SET) whence=FILE_BEGIN;
916 else if (whence==SEEK_CUR) whence=FILE_CURRENT;
917 else if (whence==SEEK_END) whence=FILE_END;
918 flush();
919 LONG offhigh=pos>>32;
920 SetFilePointer(out, pos, &offhigh, whence);
921 }
922
923 // get file pointer
tell()924 int64_t tell() {
925 LONG offhigh=0;
926 DWORD r=SetFilePointer(out, 0, &offhigh, FILE_CURRENT);
927 return (int64_t(offhigh)<<32)+r+ptr;
928 }
929
930 // Truncate file and move file pointer to end
truncate(int64_t newsize=0)931 void truncate(int64_t newsize=0) {
932 seek(newsize, SEEK_SET);
933 SetEndOfFile(out);
934 }
935
936 // Close file and set date if not 0. Set attr if low byte is 'w'.
close(int64_t date=0,int64_t attr=0)937 void close(int64_t date=0, int64_t attr=0) {
938 if (isopen()) {
939 flush();
940 setDate(out, date);
941 CloseHandle(out);
942 out=INVALID_HANDLE_VALUE;
943 if ((attr&255)=='w')
944 SetFileAttributes(filename.c_str(), attr>>8);
945 filename=L"";
946 }
947 }
~OutputFile()948 ~OutputFile() {close();}
949 };
950
951 #endif
952
953 // Write bufp[0..size-1]
write(const char * bufp,int size)954 void OutputFile::write(const char* bufp, int size) {
955 if (ptr==BUFSIZE) flush();
956 while (size>0) {
957 assert(ptr>=0 && ptr<BUFSIZE);
958 int n=BUFSIZE-ptr; // number of bytes to copy to buf
959 if (n>size) n=size;
960 memcpy(&buf[ptr], bufp, n);
961 size-=n;
962 bufp+=n;
963 ptr+=n;
964 if (ptr==BUFSIZE) flush();
965 }
966 }
967
968 // Count bytes written and discard them
969 struct Counter: public libzpaq::Writer {
970 int64_t pos; // count of written bytes
CounterCounter971 Counter(): pos(0) {}
putCounter972 void put(int c) {++pos;}
writeCounter973 void write(const char* bufp, int size) {pos+=size;}
974 };
975
976 /////////////////////////////// Archive ///////////////////////////////
977
978 // An Archive is a multi-part file that supports encrypted input
979 class Archive: public libzpaq::Reader, public libzpaq::Writer {
980 libzpaq::AES_CTR* aes; // NULL if not encrypted
981 struct FE { // File element for multi-part archives
982 string fn; // file name
983 int64_t end; // size of previous and current files
FEArchive::FE984 FE(): end(0) {}
FEArchive::FE985 FE(const string& s, int64_t e): fn(s), end(e) {}
986 };
987 vector<FE> files; // list of parts. only last part is writable.
988 int fi; // current file in files
989 int64_t off; // total offset over all files
990 int mode; // 'r' or 'w' for reading or writing or 0 if closed
991 InputFile in; // currently open input file
992 OutputFile out; // currently open output file
993 public:
994
995 // Constructor
Archive()996 Archive(): aes(0), fi(0), off(0), mode(0) {}
997
998 // Destructor
~Archive()999 ~Archive() {close();}
1000
1001 // Open filename for read and write. If filename contains wildards * or ?
1002 // then replace * with part number 1, 2, 3... or ? with single digits
1003 // up to the last existing file. Return true if at least one file is found.
1004 // If password is not NULL then assume the concatenation of the files
1005 // is in encrypted format. mode_ is 'r' for reading or 'w' for writing.
1006 // If the filename contains wildcards then output is to the first
1007 // non-existing file, else to filename. If newsize>=0 then truncate
1008 // the output to newsize bytes. If password and offset>0 then encrypt
1009 // output as if previous parts had size offset and salt salt.
1010 bool open(const char* filename, const char* password=0, int mode_='r',
1011 int64_t newsize=-1, int64_t offset=0, const char* salt=0);
1012
1013 // True if archive is open
isopen() const1014 bool isopen() const {return files.size()>0;}
1015
1016 // Position the next read or write offset to p.
1017 void seek(int64_t p, int whence);
1018
1019 // Return current file offset.
tell() const1020 int64_t tell() const {return off;}
1021
1022 // Read up to n bytes into buf at current offset. Return 0..n bytes
1023 // actually read. 0 indicates EOF.
read(char * buf,int n)1024 int read(char* buf, int n) {
1025 assert(mode=='r');
1026 if (fi>=size(files)) return 0;
1027 if (!in.isopen()) return 0;
1028 n=in.read(buf, n);
1029 seek(n, SEEK_CUR);
1030 return n;
1031 }
1032
1033 // Read and return 1 byte or -1 (EOF)
get()1034 int get() {
1035 assert(mode=='r');
1036 if (fi>=size(files)) return -1;
1037 while (off==files[fi].end) {
1038 in.close();
1039 if (++fi>=size(files)) return -1;
1040 if (!in.open(files[fi].fn.c_str(), aes, fi>0 ? files[fi-1].end : 0))
1041 error("cannot read next archive part");
1042 }
1043 ++off;
1044 return in.get();
1045 }
1046
1047 // Write one byte
put(int c)1048 void put(int c) {
1049 assert(fi==size(files)-1);
1050 assert(fi>0 || out.tell()==off);
1051 assert(fi==0 || out.tell()+files[fi-1].end==off);
1052 assert(mode=='w');
1053 out.put(c);
1054 ++off;
1055 }
1056
1057 // Write buf[0..n-1]
write(const char * buf,int n)1058 void write(const char* buf, int n) {
1059 assert(fi==size(files)-1);
1060 assert(fi>0 || out.tell()==off);
1061 assert(fi==0 || out.tell()+files[fi-1].end==off);
1062 assert(mode=='w');
1063 out.write(buf, n);
1064 off+=n;
1065 }
1066
1067 // Close any open part
close()1068 void close() {
1069 if (out.isopen()) out.close();
1070 if (in.isopen()) in.close();
1071 if (aes) {
1072 delete aes;
1073 aes=0;
1074 }
1075 files.clear();
1076 fi=0;
1077 off=0;
1078 mode=0;
1079 }
1080 };
1081
// Open the archive named filename for reading (mode_=='r') or for
// appending (mode_=='w'). Unless offset is nonzero, the existing parts are
// enumerated and sized first. In read mode, return true if at least one
// part was found. In write mode, optionally truncate to newsize, choose
// the part to write, write a salt when a new encrypted archive is
// started, open that part for output and return true. Failures to open,
// read or write a part are reported through error().
bool Archive::open(const char* filename, const char* password, int mode_,
                   int64_t newsize, int64_t offset, const char* salt) {
  assert(filename);
  assert(mode_=='r' || mode_=='w');
  mode=mode_;

  // Read part files and get sizes. Get salt from the first part.
  // The loop is skipped entirely when offset is nonzero.
  string next;
  for (int i=1; !offset; ++i) {
    next=subpart(filename, i);
    if (!exists(next)) break;
    if (files.size()>0 && files[0].fn==next) break; // part overflow

    // set up key from salt in first file
    // (skipped when newsize is 0: the archive is about to be emptied)
    if (!in.open(next.c_str())) error("cannot read archive");
    if (i==1 && password && newsize!=0) {
      char slt[32], key[32];
      if (in.read(slt, 32)!=32) error("no salt");
      libzpaq::stretchKey(key, password, slt);
      aes=new libzpaq::AES_CTR(key, 32, slt);
    }

    // Get file size. FE::end is cumulative over this and earlier parts.
    in.seek(0, SEEK_END);
    files.push_back(FE(next,
        in.tell()+(files.size() ? files[files.size()-1].end : 0)));
    in.close();
    if (next==filename) break; // no wildcards
  }

  // If offset is not 0 then use it for the part sizes and use
  // salt as the salt of the first part.
  if (offset>0) {
    files.push_back(FE("", offset));        // unnamed stand-in of size offset
    files.push_back(FE(filename, offset));  // the part to write, empty so far
    if (password) {
      assert(salt);
      char key[32]={0};
      libzpaq::stretchKey(key, password, salt);
      aes=new libzpaq::AES_CTR(key, 32, salt);
    }
  }

  // Open file for reading
  fi=files.size();
  if (mode=='r') {
    // skip the 32 byte salt if a password was given
    seek(32*(password!=0), SEEK_SET); // open first input file
    return files.size()>0;
  }

  // Truncate, deleting extra parts
  if (newsize>=0) {
    while (files.size()>0 && files.back().end>newsize) {
      // Delete the last part if it starts at or after newsize
      // (or if everything is to be removed)
      if (newsize==0 || (files.size()>1 &&
          files[files.size()-2].end>=newsize)) {
        if (quiet<MAX_QUIET) {
          printUTF8(files.back().fn.c_str(), con);
          fprintf(con, " deleted.\n");
        }
        next=files.back().fn.c_str();  // name of the deleted part
        delete_file(files.back().fn.c_str());
        files.pop_back();
      }
      // Otherwise newsize falls inside the last part: shorten it
      else if (files.size()>0) {
        if (!out.open(files.back().fn.c_str()))
          error("cannot open archive part to truncate");
        int64_t newlen=newsize;  // new length of this part alone
        if (files.size()>=2) newlen-=files[files.size()-2].end;
        if (quiet<MAX_QUIET) {
          printUTF8(files.back().fn.c_str(), con);
          fprintf(con, " truncated from %1.0f to %1.0f bytes.\n",
                  out.tell()+0.0, newlen+0.0);
        }
        assert(newlen>=0);
        out.truncate(newlen);
        out.close();
        files.back().end=newsize;
      }
    }
  }

  // Get name of part to write. If filename has wildcards then use
  // the next part number, else just filename.
  if (files.size()==0 || (next!=filename && next!=files[0].fn))
    files.push_back(FE(next, files.size() ? files.back().end : 0));

  // Write salt for a new encrypted archive
  // (a password was given but no key has been set up above)
  fi=files.size()-1;
  assert(fi>=0);
  if (password && !aes) {
    assert(fi==0);
    assert(files.size()==1);
    if (!out.open(files[fi].fn.c_str()))
      error("cannot write salt to archive");
    out.seek(0, SEEK_SET);
    char key[32]={0};
    char slt[32]={0};
    if (salt) memcpy(slt, salt, 32);  // caller supplied salt
    else random(slt, 32);             // otherwise fill slt using random()
    libzpaq::stretchKey(key, password, slt);
    aes=new libzpaq::AES_CTR(key, 32, slt);
    out.write(slt, 32);
    files[fi].end=out.tell();  // 32
    out.close();
  }

  // Open for output
  assert(fi+1==size(files));
  assert(fi>=0);
  makepath(files[fi].fn.c_str());
  if (!out.open(files[fi].fn.c_str(), aes, fi>0 ? files[fi-1].end : 0))
    error("cannot open archive for output");
  off=files.back().end;
  assert(fi>0 || files[fi].end==out.tell());
  assert(fi==0 || files[fi].end==out.tell()+files[fi-1].end);
  if (quiet<MAX_QUIET) {
    fprintf(con, "Appending to ");
    printUTF8(files[fi].fn.c_str(), con);
    fprintf(con, " at offset %1.0f\n", out.tell()+0.0);
  }
  return true;
}
1204
seek(int64_t p,int whence)1205 void Archive::seek(int64_t p, int whence) {
1206 if (whence==SEEK_SET) off=p;
1207 else if (whence==SEEK_CUR) off+=p;
1208 else if (whence==SEEK_END) off=(files.size() ? files.back().end : 0)+p;
1209 else assert(false);
1210 if (mode=='r') {
1211 int oldfi=fi;
1212 for (fi=0; fi<size(files) && off>=files[fi].end; ++fi);
1213 if (fi!=oldfi) {
1214 in.close();
1215 if (fi<size(files) && !in.open(files[fi].fn.c_str(), aes,
1216 fi>0 ? files[fi-1].end : 0))
1217 error("cannot reopen archive after seek");
1218 }
1219 if (fi<size(files)) in.seek(off-files[fi].end, SEEK_END);
1220 }
1221 else if (mode=='w') {
1222 assert(files.size()>0);
1223 assert(out.isopen());
1224 assert(fi+1==size(files));
1225 p=off;
1226 if (files.size()>=2) p-=files[files.size()-2].end;
1227 if (p<0) error("seek before start of output");
1228 out.seek(p, SEEK_SET);
1229 }
1230 }
1231
1232 ///////////////////////// NumberOfProcessors ///////////////////////////
1233
// Estimate the number of processor cores. The result is at least 1,
// and at most 2 when pointers are 4 bytes wide.
int numberOfProcessors() {
#ifdef unix
  int cores=(int)sysconf(_SC_NPROCESSORS_ONLN);
#else
  // In Windows return %NUMBER_OF_PROCESSORS%
  int cores=0;
  const char* env=getenv("NUMBER_OF_PROCESSORS");
  if (env) cores=atoi(env);
#endif
  if (cores<1) return 1;
  if (sizeof(char*)==4 && cores>2) return 2;
  return cores;
}
1250
1251 ////////////////////////////// StringBuffer //////////////////////////
1252
1253 // For libzpaq output to a string
1254 struct StringWriter: public libzpaq::Writer {
1255 string s;
putStringWriter1256 void put(int c) {s+=char(c);}
1257 };
1258
1259 // WriteBuffer for memory efficient output buffering
1260 class WriteBuffer: public libzpaq::Writer {
1261 enum {BUFSIZE=(1<<19)-80}; // buffer size
1262 int wptr; // number of bytes in last buffer
1263 int limit; // max buffers
1264 vector<char*> v; // array of buffers
1265 WriteBuffer& operator=(const WriteBuffer&); // no assignment
1266 WriteBuffer(const WriteBuffer&); // no copy
1267 void grow(); // append a buffer
1268
1269 public:
WriteBuffer()1270 WriteBuffer(): wptr(BUFSIZE), limit(0x7fffffff) {}
1271
1272 // Number of bytes put
size() const1273 int64_t size() const {return int64_t(v.size())*BUFSIZE+wptr-BUFSIZE;}
1274
1275 // Set allocation limit
setLimit(size_t lim)1276 void setLimit(size_t lim) {limit=lim/BUFSIZE+1;}
1277
1278 // store n bytes from buf[0..n-1]
1279 void write(const char* buf, int n);
1280
1281 // store 1 byte
put(int c)1282 void put(int c) {
1283 if (wptr==BUFSIZE) grow();
1284 assert(v.size()>0);
1285 assert(wptr>=0 && wptr<BUFSIZE);
1286 v.back()[wptr++]=c;
1287 }
1288
1289 // write to out
1290 void save(libzpaq::Writer* out);
1291
1292 // Write n bytes at begin..begin+n-1 to out at offset off
1293 void save(OutputFile& out, int64_t off, int64_t begin, int64_t n);
1294
1295 // Return the SHA-1 of n bytes at begin..begin+n-1 to result[0..19]
1296 void sha1(char* result, int64_t begin, int64_t n);
1297
1298 // Free memory
1299 void reset();
~WriteBuffer()1300 ~WriteBuffer() {reset();}
1301 };
1302
1303 // Append a buffer
grow()1304 void WriteBuffer::grow() {
1305 assert(wptr==BUFSIZE);
1306 if (int(v.size())>=limit) error("WriteBuffer overflow");
1307 v.push_back((char*)malloc(BUFSIZE));
1308 if (!v.back()) error("WriteBuffer: out of memory");
1309 wptr=0;
1310 }
1311
1312 // store n bytes from buf[0..n-1]
write(const char * buf,int n)1313 void WriteBuffer::write(const char* buf, int n) {
1314 while (n>0) {
1315 assert(wptr>=0 && wptr<=BUFSIZE);
1316 if (wptr==BUFSIZE) grow();
1317 int n1=n;
1318 if (n1>BUFSIZE-wptr) n1=BUFSIZE-wptr;
1319 assert(n1>0 && n1<=BUFSIZE);
1320 memcpy(v.back()+wptr, buf, n1);
1321 wptr+=n1;
1322 n-=n1;
1323 buf+=n1;
1324 }
1325 }
1326
1327 // write to out
save(libzpaq::Writer * out)1328 void WriteBuffer::save(libzpaq::Writer* out) {
1329 if (!out) return;
1330 for (int i=0; i<int(v.size())-1; ++i)
1331 out->write(v[i], BUFSIZE);
1332 if (v.size())
1333 out->write(v.back(), wptr);
1334 }
1335
1336 // Write n bytes at begin..begin+n-1 to out at offset off..off+n-1
save(OutputFile & out,int64_t off,int64_t begin,int64_t n)1337 void WriteBuffer::save(OutputFile& out, int64_t off, int64_t begin,
1338 int64_t n) {
1339 assert(out.isopen());
1340 assert(off>=0);
1341 assert(begin>=0);
1342 assert(n>=0);
1343 assert(begin+n<=size());
1344
1345 // Trim leading and trailing zeros before writing
1346 for (int i=begin/BUFSIZE; i<int(v.size()); ++i) {
1347 assert(i>=0 && i<int(v.size()));
1348 int64_t b=begin-int64_t(i)*BUFSIZE;
1349 int64_t e=b+n;
1350 if (b<0) b=0;
1351 if (e>BUFSIZE) e=BUFSIZE;
1352 if (e<=0) break;
1353 int b1=b, e1=e;
1354 while (b1<e1 && v[i][b1]==0) ++b1;
1355 while (e1>b1 && v[i][e1-1]==0) --e1;
1356 if (b1-b<4096) b1=b;
1357 if (e-e1<4096) e1=e;
1358 if (e1>b1) out.write(v[i]+b1, off-begin+i*BUFSIZE+b1, e1-b1);
1359 }
1360 }
1361
1362 // Return the SHA-1 of n bytes at begin..begin+n-1 to result[0..19]
sha1(char * result,int64_t begin,int64_t n)1363 void WriteBuffer::sha1(char* result, int64_t begin, int64_t n) {
1364 if (!result) return;
1365 assert(begin>=0);
1366 assert(n>=0);
1367 assert(begin+n<=size());
1368 libzpaq::SHA1 s;
1369 for (int i=begin/BUFSIZE; i<int(v.size()); ++i) {
1370 assert(i>=0 && i<int(v.size()));
1371 int64_t b=begin-int64_t(i)*BUFSIZE;
1372 int64_t e=b+n;
1373 if (b<0) b=0;
1374 if (e>BUFSIZE) e=BUFSIZE;
1375 if (e<=0) break;
1376 while (b<e) s.put(v[i][b++]);
1377 }
1378 assert(uint64_t(n)==s.usize());
1379 memcpy(result, s.result(), 20);
1380 }
1381
1382 // Free memory
reset()1383 void WriteBuffer::reset() {
1384 while (v.size()>0) {
1385 if (v.back()) free(v.back());
1386 v.pop_back();
1387 }
1388 wptr=BUFSIZE;
1389 }
1390
1391 // For (de)compressing to/from a string. Writing appends bytes
1392 // which can be later read.
1393 class StringBuffer: public libzpaq::Reader, public libzpaq::Writer {
1394 unsigned char* p; // allocated memory, not NUL terminated, may be NULL
1395 size_t al; // number of bytes allocated, 0 iff p is NULL
1396 size_t wpos; // index of next byte to write, wpos <= al
1397 size_t rpos; // index of next byte to read, rpos < wpos or return EOF.
1398 size_t limit; // max size, default = -1
1399 const size_t init; // initial size on first use after reset
1400
1401 // Increase capacity to a without changing size
reserve(size_t a)1402 void reserve(size_t a) {
1403 assert(!al==!p);
1404 if (a<=al) return;
1405 unsigned char* q=0;
1406 if (a>0) q=(unsigned char*)(p ? realloc(p, a) : malloc(a));
1407 if (a>0 && !q) {
1408 fprintf(stderr, "StringBuffer realloc %1.0f to %1.0f at %p failed\n",
1409 double(al), double(a), p);
1410 error("Out of memory");
1411 }
1412 p=q;
1413 al=a;
1414 }
1415
1416 // Enlarge al to make room to write at least n bytes.
lengthen(unsigned n)1417 void lengthen(unsigned n) {
1418 assert(wpos<=al);
1419 if (wpos+n>limit) error("StringBuffer overflow");
1420 if (wpos+n<=al) return;
1421 size_t a=al;
1422 while (wpos+n>=a) a=a*2+init;
1423 reserve(a);
1424 }
1425
1426 // No assignment or copy
1427 void operator=(const StringBuffer&);
1428 StringBuffer(const StringBuffer&);
1429
1430 public:
1431
1432 // Direct access to data
data()1433 unsigned char* data() {assert(p || wpos==0); return p;}
1434
1435 // Allocate no memory initially
StringBuffer(size_t n=0)1436 StringBuffer(size_t n=0):
1437 p(0), al(0), wpos(0), rpos(0), limit(size_t(-1)), init(n>128?n:128) {}
1438
1439 // Set output limit
setLimit(size_t n)1440 void setLimit(size_t n) {limit=n;}
1441
1442 // Free memory
~StringBuffer()1443 ~StringBuffer() {if (p) free(p);}
1444
1445 // Return number of bytes written.
size() const1446 size_t size() const {return wpos;}
1447
1448 // Return number of bytes left to read
remaining() const1449 size_t remaining() const {return wpos-rpos;}
1450
1451 // Reset size to 0.
reset()1452 void reset() {
1453 if (p) free(p);
1454 p=0;
1455 al=rpos=wpos=0;
1456 }
1457
1458 // Write a single byte.
put(int c)1459 void put(int c) { // write 1 byte
1460 lengthen(1);
1461 assert(p);
1462 assert(wpos<al);
1463 p[wpos++]=c;
1464 assert(wpos<=al);
1465 }
1466
1467 // Write buf[0..n-1]
write(const char * buf,int n)1468 void write(const char* buf, int n) {
1469 assert(buf);
1470 if (n<1) return;
1471 lengthen(n);
1472 assert(p);
1473 assert(wpos+n<=al);
1474 memcpy(p+wpos, buf, n);
1475 wpos+=n;
1476 }
1477
1478 // Read a single byte. Return EOF (-1) and reset at end of string.
get()1479 int get() {
1480 assert(rpos<=wpos);
1481 assert(rpos==wpos || p);
1482 return rpos<wpos ? p[rpos++] : (reset(),-1);
1483 }
1484
1485 // Read up to n bytes into buf[0..] or fewer if EOF is first.
1486 // Return the number of bytes actually read.
read(char * buf,int n)1487 int read(char* buf, int n) {
1488 assert(rpos<=wpos);
1489 assert(wpos<=al);
1490 assert(!al==!p);
1491 if (rpos+n>wpos) n=wpos-rpos;
1492 if (n>0) memcpy(buf, p+rpos, n);
1493 rpos+=n;
1494 return n;
1495 }
1496
1497 // Return the entire string as a read-only array.
c_str() const1498 const char* c_str() const {return (const char*)p;}
1499
1500 // Truncate the string to size i.
resize(size_t i)1501 void resize(size_t i) {wpos=i;}
1502
1503 // Write a string.
operator +=(const string & t)1504 void operator+=(const string& t) {write(t.data(), t.size());}
1505
1506 // Swap efficiently (init is not swapped)
swap(StringBuffer & s)1507 void swap(StringBuffer& s) {
1508 std::swap(p, s.p);
1509 std::swap(al, s.al);
1510 std::swap(wpos, s.wpos);
1511 std::swap(rpos, s.rpos);
1512 std::swap(limit, s.limit);
1513 }
1514 };
1515
1516 ////////////////////////////// misc ///////////////////////////////////
1517
// Fold 'A'..'Z' to lower case, except when compiled with unix defined,
// in which case c is returned unchanged.
inline int tolowerW(int c) {
#ifdef unix
  return c;
#else
  return (c>='A' && c<='Z') ? c+('a'-'A') : c;
#endif
}
1525
1526 // Return true if strings a == b or a+"/" is a prefix of b
1527 // or a ends in "/" and is a prefix of b.
1528 // Match ? in a to any char in b.
1529 // Match * in a to any string in b.
1530 // In Windows, not case sensitive.
ispath(const char * a,const char * b)1531 bool ispath(const char* a, const char* b) {
1532 for (; *a; ++a, ++b) {
1533 const int ca=tolowerW(*a);
1534 const int cb=tolowerW(*b);
1535 if (ca=='*') {
1536 while (true) {
1537 if (ispath(a+1, b)) return true;
1538 if (!*b) return false;
1539 ++b;
1540 }
1541 }
1542 else if (ca=='?') {
1543 if (*b==0) return false;
1544 }
1545 else if (ca==cb && ca=='/' && a[1]==0)
1546 return true;
1547 else if (ca!=cb)
1548 return false;
1549 }
1550 return *b==0 || *b=='/';
1551 }
1552
// Return true if the attributes encoded in attr (as in DT) match the
// specification in s (as in the -not option). s is a string like
// ":+da-shr+i": a colon, then attribute letters, each required to be set
// if the most recent sign was + or clear if it was -. Letters before the
// first sign, and letters that are not recognized, are ignored.
// The low byte of attr selects the meaning of the letters:
//   'w' (Windows): d,a,s,h,r,i are bits 4,5,2,1,0,13 (A is the same as a).
//   'u' (Unix):    d,r,w,x are bits 14,8,7,6.
// Bit numbers count from the second byte of attr. Return false if s is
// NULL or does not start with ':', or if s has letters and the low byte
// of attr is neither 'w' nor 'u'.
bool isattr(const char* s, int64_t attr) {
  if (s==0 || s[0]!=':') return false;

  const int type=int(attr&255);
  const bool windows=type=='w';
  const bool unixlike=type=='u';
  int sign=0;  // '+', '-' or 0 if none seen yet

  for (const char* q=s+1; *q; ++q) {
    if (!windows && !unixlike) return false;
    const char c=*q;
    if (c=='+' || c=='-') {
      sign=c;
      continue;
    }

    // Translate the letter to a bit position, -1 if not recognized
    int pos=-1;
    if (windows) {
      if (c=='d') pos=4;
      else if (c=='a' || c=='A') pos=5;
      else if (c=='s') pos=2;
      else if (c=='h') pos=1;
      else if (c=='r') pos=0;
      else if (c=='i') pos=13;
    }
    else {
      if (c=='d') pos=14;
      else if (c=='r') pos=8;
      else if (c=='w') pos=7;
      else if (c=='x') pos=6;
    }
    if (pos<0) continue;

    // Test the bit against the current sign
    const bool isset=((attr>>(pos+8))&1)!=0;
    if (sign=='+' && !isset) return false;
    if (sign=='-' && isset) return false;
  }
  return true;
}
1602
// Return a copy of s with 'A'..'Z' replaced by 'a'..'z'.
// All other characters are left as they are.
string lowercase(string s) {
  for (string::iterator it=s.begin(); it!=s.end(); ++it) {
    const char c=*it;
    if ('A'<=c && c<='Z') *it=char(c+('a'-'A'));
  }
  return s;
}
1609
// Decode the 4 byte little-endian int at s and advance s past it
int btoi(const char* &s) {
  const unsigned char* b=(const unsigned char*)s;
  s+=4;
  return b[0]|(b[1]<<8)|(b[2]<<16)|(b[3]<<24);
}
1615
// Read 8 byte little-endian int and advance s.
// The value is assembled in an unsigned 64 bit integer: shifting a
// negative signed value left is undefined before C++20.
int64_t btol(const char* &s) {
  uint64_t r=0;
  for (int i=7; i>=0; --i)  // most significant byte is last
    r=(r<<8)|uint64_t(s[i]&255);
  s+=8;
  return int64_t(r);
}
1621
// Encode x as a 4 byte string, least significant byte first
string itob(unsigned x) {
  string s(4, '\0');
  for (int i=0; i<4; ++i)
    s[i]=char(x>>(8*i));
  return s;
}
1628
// Encode x as an 8 byte string, least significant byte first
string ltob(int64_t x) {
  string s(8, '\0');
  for (int i=0; i<8; ++i)
    s[i]=char(x>>(8*i));
  return s;
}
1636
// Convert decimal, octal (leading o) or hex (leading x) string to int.
// Letters may be upper or lower case. A '-' seen while the value is
// still 0 makes the result negative. Conversion stops at the first
// character that is not part of the number.
int ntoi(const char* s) {
  int n=0, base=10, sign=1;
  for (; *s; ++s) {
    // Convert through unsigned char: the <cctype> functions are undefined
    // for negative arguments, which plain char can hold for bytes >= 128.
    const int c=tolower((unsigned char)*s);
    if (!n && c=='x') base=16;
    else if (!n && c=='o') base=8;
    else if (!n && c=='-') sign=-1;
    else if (c>='0' && c<='9') n=n*base+c-'0';
    else if (base==16 && c>='a' && c<='f') n=n*base+c-'a'+10;
    else break;
  }
  return n*sign;
}
1652
1653 /////////////////////////// read_password ////////////////////////////
1654
1655 // Read a password from argv[i+1..argc-1] or from the console without
1656 // echo (repeats times) if this sequence is empty. repeats can be 1 or 2.
1657 // If 2, require the same password to be entered twice in a row.
1658 // Advance i by the number of words in the password on the command
1659 // line, which will be 0 if the user is prompted.
1660 // Write the SHA-256 hash of the password in hash[0..31].
1661 // Return the length of the original password.
1662
read_password(char * hash,int repeats,int argc,const char ** argv,int & i)1663 int read_password(char* hash, int repeats,
1664 int argc, const char** argv, int& i) {
1665 assert(repeats==1 || repeats==2);
1666 libzpaq::SHA256 sha256;
1667 int result=0;
1668
1669 // Read password from argv[i+1..argc-1]
1670 if (i<argc-1 && argv[i+1][0]!='-') {
1671 while (true) { // read multi-word password with spaces between args
1672 ++i;
1673 for (const char* p=argv[i]; p && *p; ++p) sha256.put(*p);
1674 if (i<argc-1 && argv[i+1][0]!='-') sha256.put(' ');
1675 else break;
1676 }
1677 result=sha256.usize();
1678 memcpy(hash, sha256.result(), 32);
1679 return result;
1680 }
1681
1682 // Otherwise prompt user
1683 char oldhash[32]={0};
1684 if (repeats==2)
1685 fprintf(stderr, "Enter new password twice:\n");
1686 else {
1687 fprintf(stderr, "Password: ");
1688 fflush(stderr);
1689 }
1690 do {
1691
1692 // Read password without echo to end of line
1693 #if unix
1694 struct termios term, oldterm;
1695 FILE* in=fopen("/dev/tty", "r");
1696 if (!in) in=stdin;
1697 tcgetattr(fileno(in), &oldterm);
1698 memcpy(&term, &oldterm, sizeof(term));
1699 term.c_lflag&=~ECHO;
1700 term.c_lflag|=ECHONL;
1701 tcsetattr(fileno(in), TCSANOW, &term);
1702 char buf[256];
1703 if (!fgets(buf, 250, in)) return 0;
1704 tcsetattr(fileno(in), TCSANOW, &oldterm);
1705 if (in!=stdin) fclose(in);
1706 for (unsigned i=0; i<250 && buf[i]!=10 && buf[i]!=13 && buf[i]!=0; ++i)
1707 sha256.put(buf[i]);
1708 #else
1709 HANDLE h=GetStdHandle(STD_INPUT_HANDLE);
1710 DWORD mode=0, n=0;
1711 wchar_t buf[256];
1712 if (h!=INVALID_HANDLE_VALUE
1713 && GetConsoleMode(h, &mode)
1714 && SetConsoleMode(h, mode&~ENABLE_ECHO_INPUT)
1715 && ReadConsole(h, buf, 250, &n, NULL)) {
1716 SetConsoleMode(h, mode);
1717 fprintf(stderr, "\n");
1718 for (unsigned i=0; i<n && i<250 && buf[i]!=10 && buf[i]!=13; ++i)
1719 sha256.put(buf[i]);
1720 }
1721 else {
1722 fprintf(stderr, "Windows error %d\n", int(GetLastError()));
1723 error("Read password failed");
1724 }
1725 #endif
1726 result=sha256.usize();
1727 memcpy(oldhash, hash, 32);
1728 memcpy(hash, sha256.result(), 32);
1729 memset(buf, 0, sizeof(buf)); // clear sensitive data
1730 }
1731 while (repeats==2 && memcmp(oldhash, hash, 32));
1732 return result;
1733 }
1734
1735 /////////////////////////////// Jidac /////////////////////////////////
1736
1737 // A Jidac object represents an archive contents: a list of file
1738 // fragments with hash, size, and archive offset, and a list of
1739 // files with date, attributes, and list of fragment pointers.
1740 // Methods add to, extract from, compare, and list the archive.
1741
// Special values for HT::csize and version (constants, not an enum).
// HT_BAD is the default csize of a newly constructed HT.
static const int64_t EXTRACTED= 0x7FFFFFFFFFFFFFFELL; // decompressed?
static const int64_t HT_BAD= -0x7FFFFFFFFFFFFFFALL; // no such frag
static const int64_t DEFAULT_VERSION=9999999999999LL; // unless -until
1746
1747 // fragment hash table entry
1748 struct HT {
1749 unsigned char sha1[20]; // fragment hash
1750 int usize; // uncompressed size, -1 if unknown
1751 int64_t csize; // if >=0 then block offset else -fragment number
HTHT1752 HT(const char* s=0, int u=-1, int64_t c=HT_BAD) {
1753 if (s) memcpy(sha1, s, 20);
1754 else memset(sha1, 0, 20);
1755 usize=u; csize=c;
1756 }
1757 };
1758
// One version of a file name
struct DTV {
  int64_t date;          // decimal YYYYMMDDHHMMSS (UT) or 0 if deleted
  int64_t size;          // size or -1 if unknown
  int64_t attr;          // first 8 attribute bytes
  double csize;          // approximate compressed size
  vector<unsigned> ptr;  // list of fragment indexes to HT
  int version;           // which transaction was it added?

  // Everything zero, no fragments
  DTV():
    date(0),
    size(0),
    attr(0),
    csize(0),
    ptr(),
    version(0) {}
};
1769
1770 // filename entry
1771 struct DT {
1772 int64_t edate; // date of external file, 0=not found
1773 int64_t esize; // size of external file
1774 int64_t eattr; // external file attributes ('u' or 'w' in low byte)
1775 uint64_t sortkey; // determines sort order for compression
1776 vector<unsigned> eptr; // fragment list of external file to add
1777 vector<DTV> dtv; // list of versions
1778 int written; // 0..ptr.size() = fragments output. -1=ignore
DTDT1779 DT(): edate(0), esize(0), eattr(0), sortkey(0), written(-1) {}
1780 };
1781
// Information about one version (transaction) of the archive
struct VER {
  int64_t date;            // 0 if not JIDAC
  int64_t usize;           // uncompressed size of files
  int64_t offset;          // start of transaction
  int64_t csize;           // size of compressed data, -1 = no index
  int updates;             // file updates
  int deletes;             // file deletions
  unsigned firstFragment;  // first fragment ID

  // All fields start at zero
  VER():
    date(0),
    usize(0),
    offset(0),
    csize(0),
    updates(0),
    deletes(0),
    firstFragment(0) {}
};
1793
// Map from file name to its DT entry
typedef map<string, DT> DTMap;
// Forward declaration; not used within this part of the file
class CompressJob;
1796
// Do everything: holds the parsed command line and the archive index,
// and implements each command. doCommand() is the only public entry point.
class Jidac {
public:
  // Takes the program arguments. The int result is presumably the
  // process exit status - TODO confirm against the definition.
  int doCommand(int argc, const char** argv);
  friend ThreadReturn decompressThread(void* arg);
  friend ThreadReturn testThread(void* arg);
  friend struct ExtractJob;
private:

  // Command line arguments
  string command;           // "-add", "-extract", "-list", etc.
  string archive;           // archive name
  vector<string> files;     // list of files and directories to add
  vector<string> notfiles;  // list of prefixes to exclude
  vector<string> tofiles;   // files renamed with -to
  int64_t date;             // now as decimal YYYYMMDDHHMMSS (UT)
  int64_t version;          // version number or 14 digit date
  int threads;              // default is number of cores
  int since;                // First version to -list
  int summary;              // Arg to -summary
  int fragment;             // Log average fragment size in KB, default 6
  string method;            // 0..9, default "1"
  bool force;               // -force option
  bool all;                 // -all option
  bool noattributes;        // -noattributes option
  bool nodelete;            // -nodelete option
  bool duplicates;          // -duplicates option
  bool resetArchive;        // -not :-A option
  char password_string[32]; // hash of -key argument
  char new_password_string[32]; // hash of encrypt -to arg
  const char* password;     // points to password_string or NULL
  const char* new_password; // points to new_password_string or NULL
  string with;              // -with option

  // Archive state
  int64_t dhsize;           // total size of D blocks according to H blocks
  int64_t dcsize;           // total size of D blocks according to C blocks
  vector<HT> ht;            // list of fragments
  DTMap dt;                 // set of files
  vector<VER> ver;          // version info

  // Commands
  int add();                // add or delete, return 1 if error else 0
  int extract();            // extract or test, return 1 if error else 0
  void list();              // list
  int compare();            // compare, return 1 if differences else 0
  void purge();             // copy archive current version
  void usage();             // print help and exit(1)

  // Support functions
  string rename(const string& name);    // replace files prefix with tofiles
  string unrename(const string& name);  // undo rename
  int64_t read_archive(int *errors=0, const char* arc=0);  // read arc
  void read_args(bool scan, bool mark_all=false);  // read args, scan dirs
  void scandir(string filename, bool recurse=true);  // scan dirs to dt
  void addfile(string filename, int64_t edate, int64_t esize,
               int64_t eattr);          // add external file to dt
  void list_versions(int64_t csize);    // print ver. csize=archive size
  bool equal(DTMap::const_iterator p);  // compare to file
};
1857
1858 // Print help message
usage()1859 void Jidac::usage() {
1860 fprintf(con,
1861 "zpaq archiver for incremental backups with rollback capability.\n"
1862 "(C) 2009-2014, Dell Inc. Free under GPL v3. http://mattmahoney.net/zpaq\n"
1863 #ifndef NDEBUG
1864 "DEBUG version\n"
1865 #endif
1866 "\n"
1867 "Usage: zpaq command archive[.zpaq] files... -options...\n"
1868 "Files can be directories in which case the whole tree is included.\n"
1869 "Wildcards * and ? in file names match any string or character.\n"
1870 "Wildcards in archive match numbers or digits in multi-part archive.\n"
1871 "Part 0 is the index. If present, no other parts are needed to update.\n"
1872 " a add Add changed files to archive (or to \"\" to test).\n"
1873 " x extract Decompress named files (default: entire contents).\n"
1874 " l list List named files (default: entire contents).\n"
1875 " c compare Compare with external files (default: entire contents).\n"
1876 " d delete Mark as deleted in a new version of archive.\n"
1877 " t test Extract and verify but discard output.\n"
1878 " p purge -to out[.zpaq] Create new archive with old versions removed.\n"
1879 "Options (may be abbreviated if not ambiguous):\n"
1880 "-all l,t: All versions. c: compare metadata too. p: keep all.\n"
1881 "-duplicates l: List by size and label identical files with =\n"
1882 "-force a: Add even if dates unchanged. x: overwrite output files.\n"
1883 "-fragile Don't save or verify checksums or recovery info.\n"
1884 "-fragment N a: Set dedup fragment size to 2^N KiB (default: 6).\n"
1885 "-key [password] Required if encrypted (default: prompt without echo).\n"
1886 "-method 0..5 a: Compres faster..better (default: 1)\n"
1887 "-method 0..5[B] Use 2^B MiB blocks (default: 04, 14, 26, 36, 46, 56).\n"
1888 "-method {x|s|i}B[,N2]...[{c|i|a|w|m|s|t|fF}[N1[,N2]...]]...\n"
1889 " x=journaling (default). s=streaming (no dedup). i=index (no data).\n"
1890 " N2: 0=no pre/post. 1,2=packed,byte LZ77. 3=BWT. 4..7=0..3 with E8E9.\n"
1891 " N3=LZ77 min match. N4=longer match to try first (0=none). 2^N5=search\n"
1892 " depth. 2^N6=hash table size (N6=B+21: suffix array). N7=lookahead.\n"
1893 " Context modeling defaults shown below:\n"
1894 " c0,0,0: context model. N1: 0=ICM, 1..256=CM max count. 1000..1256 halves\n"
1895 " memory. N2: 1..255=pos mod N2, 1000..1255=gap to last N2-1000 byte.\n"
1896 " N3...: order 0... context masks (0..255). 256..511=mask+byte LZ77\n"
1897 " parse state, >1000: gap of N3-1000 zeros.\n"
1898 " i: ISSE chain. N1=context order. N2...=order increment.\n"
1899 " a24,0,0: MATCH: N1=hash multiplier. N2=halve buffer. N3=halve hash tab.\n"
1900 " w1,65,26,223,20,0: Order 0..N1-1 word ISSE chain. A word is bytes\n"
1901 " N2..N2+N3-1 ANDed with N4, hash mulitpiler N5, memory halved by N6.\n"
1902 " m8,24: MIX all previous models, N1 context bits, learning rate N2.\n"
1903 " s8,32,255: SSE last model. N1 context bits, count range N2..N3.\n"
1904 " t8,24: MIX2 last 2 models, N1 context bits, learning rate N2.\n"
1905 " fF: use ZPAQL model in file F.cfg (see docs).\n"
1906 "-newkey [password] p: Set out.zpaq password (default: no encryption).\n"
1907 "-noattributes Ignore/don't save file attributes or permissions.\n"
1908 "-nodelete a: Do not mark unmatched files as deleted.\n"
1909 "-not files... Exclude, e.g. zpaq a backup c:/ -not c:/Windows *.obj\n"
1910 #ifdef unix
1911 "-not :+-drwx Exclude if user permissions are +set or -unset.\n"
1912 #else
1913 "-not :+-dashri Exclude if Windows attributes are +set or -unset.\n"
1914 "-not :-Ad If archive attribute is set then clear it before adding.\n"
1915 #endif
1916 "-quiet [N[k|m|g]] Don't show files smaller than N (default none).\n"
1917 "-since N x,c,l: Start at version N or -N from end (default: 1).\n"
1918 "-summary [N] l: List top N (20) files and types and a version table.\n"
1919 "-threads N a,x,t: Use N threads (default: %d cores detected).\n"
1920 "-to names... a,x,l,c: Rename external files or specify prefix, e.g.\n"
1921 " zpaq x backup file1 dir1 -to file2 dir2 (rename output).\n"
1922 " zpaq x backup -to tmp/ (extract all to tmp/all).\n"
1923 "-until N Revert to N'th update or -N from end (default: last).\n"
1924 "-until %s Set date and revert (UT, default time: 235959).\n"
1925 "-with archive[.zpaq] c: compare two archives.\n",
1926 threads, dateToString(date).c_str());
1927 exit(1);
1928 }
1929
1930 // Rename name by matching it to a prefix of files[i] and replacing
1931 // the prefix with tofiles[i]. If files but not tofiles is empty
1932 // then append prefix tofiles[0].
rename(const string & name)1933 string Jidac::rename(const string& name) {
1934 if (!files.size() && tofiles.size())
1935 return tofiles[0]+name;
1936 for (unsigned i=0; i<files.size() && i<tofiles.size(); ++i) {
1937 const unsigned len=files[i].size();
1938 if (name.size()>=len && name.substr(0, len)==files[i])
1939 return tofiles[i]+name.substr(files[i].size());
1940 }
1941 return name;
1942 }
1943
1944 // Rename name by matching it to a prefix of tofiles[i] and replacing
1945 // the prefix with files[i]. If files but not tofiles is empty and
1946 // prefix matches tofiles[0] then remove prefix.
unrename(const string & name)1947 string Jidac::unrename(const string& name) {
1948 if (!files.size() && tofiles.size() && name.size()>=tofiles[0].size()
1949 && tofiles[0]==name.substr(0, tofiles[0].size()))
1950 return name.substr(tofiles[0].size());
1951 for (unsigned i=0; i<files.size() && i<tofiles.size(); ++i) {
1952 const unsigned len=tofiles[i].size();
1953 if (name.size()>=len && name.substr(0, len)==tofiles[i])
1954 return files[i]+name.substr(tofiles[i].size());
1955 }
1956 return name;
1957 }
1958
// Expand opt, a command or option name that may be abbreviated and may have
// a leading "-", to its full name with a leading "-".
//
// Commands are listed first in the table and win immediately: the first
// command that opt is a prefix of is returned even if an option shares the
// prefix (e.g. "a" is "-add", not ambiguous with "-all"). "x" is accepted
// as a special abbreviation of "-extract". Any other abbreviation must
// be a prefix of exactly one option.
//
// Prints a message to stderr and exits with status 1 if opt is empty,
// matches nothing, or matches more than one option.
string expandOption(const char* opt) {
  static const char* const opts[]={
    "list","add","extract","delete","test","compare","purge",
    "method","force","quiet","summary","since","noattributes","key",
    "to","not","version","until","threads","all","fragile","duplicates",
    "fragment","nodelete","newkey","with",0};
  const unsigned NCOMMANDS=7;  // leading entries of opts[] that are commands
  assert(opt);
  if (opt[0]=='-') ++opt;
  const size_t n=strlen(opt);

  // An empty name would compare equal to every entry with strncmp(..., 0)
  // and be taken for the first command, so reject it explicitly.
  if (n==0)
    fprintf(stderr, "No such option: %s\n", opt), exit(1);
  if (n==1 && opt[0]=='x') return "-extract";

  string result;
  for (unsigned i=0; opts[i]; ++i) {
    if (strncmp(opt, opts[i], n)) continue;  // opt is not a prefix
    if (i<NCOMMANDS)  // commands are never ambiguous
      return string("-")+opts[i];
    if (!result.empty())
      fprintf(stderr, "Ambiguous: %s\n", opt), exit(1);
    result=string("-")+opts[i];
  }
  if (result.empty())
    fprintf(stderr, "No such option: %s\n", opt), exit(1);
  return result;
}
1984
1985 // Parse the command line. Return 1 if error else 0.
doCommand(int argc,const char ** argv)1986 int Jidac::doCommand(int argc, const char** argv) {
1987
1988 // initialize to default values
1989 command="";
1990 force=all=noattributes=nodelete=duplicates=resetArchive=false;
1991 since=0;
1992 summary=0;
1993 version=DEFAULT_VERSION;
1994 date=0;
1995 threads=0; // 0 = auto-detect
1996 fragment=6;
1997 password=0; // no password
1998 new_password=0; // no new password
1999 method=""; // 0..5
2000 ht.resize(1); // element 0 not used
2001 ver.resize(1); // version 0
2002 dhsize=dcsize=0;
2003
2004 // Get date
2005 time_t now=time(NULL);
2006 tm* t=gmtime(&now);
2007 date=(t->tm_year+1900)*10000000000LL+(t->tm_mon+1)*100000000LL
2008 +t->tm_mday*1000000+t->tm_hour*10000+t->tm_min*100+t->tm_sec;
2009
2010 // Get optional options
2011 for (int i=1; i<argc; ++i) {
2012 const string opt=expandOption(argv[i]); // read command
2013 if ((opt=="-add" || opt=="-extract" || opt=="-list"
2014 || opt=="-delete" || opt=="-compare" || opt=="-test"
2015 || opt=="-purge")
2016 && i<argc-1 && argv[i+1][0]!='-' && command=="") {
2017 archive=argv[++i];
2018 if (archive!="" && // Add .zpaq extension
2019 (size(archive)<5 || archive.substr(archive.size()-5)!=".zpaq"))
2020 archive+=".zpaq";
2021 command=opt;
2022 while (++i<argc && argv[i][0]!='-')
2023 files.push_back(argv[i]);
2024 --i;
2025 }
2026 else if (opt=="-quiet") { // read number followed by k, m, g
2027 quiet=MAX_QUIET;
2028 if (i<argc-1 && isdigit(argv[i+1][0])) {
2029 quiet=0;
2030 for (const char* p=argv[++i]; *p; ++p) {
2031 int c=tolower(*p);
2032 if (isdigit(c)) quiet=quiet*10+c-'0';
2033 else if (c=='k') quiet*=1000;
2034 else if (c=='m') quiet*=1000000;
2035 else if (c=='g') quiet*=1000000000;
2036 else break;
2037 }
2038 }
2039 }
2040 else if (opt=="-force") force=true;
2041 else if (opt=="-all") all=true;
2042 else if (opt=="-fragile") fragile=true;
2043 else if (opt=="-noattributes") noattributes=true;
2044 else if (opt=="-nodelete") nodelete=true;
2045 else if (opt=="-duplicates") duplicates=true;
2046 else if (opt=="-since" && i<argc-1) since=atoi(argv[++i]);
2047 else if (opt=="-fragment" && i<argc-1) fragment=atoi(argv[++i]);
2048 else if (opt=="-with" && i<argc-1) with=argv[++i];
2049 else if (opt=="-summary") {
2050 summary=20;
2051 if (i<argc-1 && isdigit(argv[i+1][0])) summary=atoi(argv[++i]);
2052 }
2053 else if (opt=="-threads" && i<argc-1) {
2054 threads=atoi(argv[++i]);
2055 if (threads<1) threads=1;
2056 }
2057 else if (opt=="-to") { // read tofiles. encrypt: read password
2058 while (++i<argc && argv[i][0]!='-')
2059 tofiles.push_back(argv[i]);
2060 --i;
2061 }
2062 else if (opt=="-not") { // read notfiles
2063 while (++i<argc && argv[i][0]!='-') {
2064 notfiles.push_back(argv[i]);
2065 if (argv[i][0]==':' && argv[i][1]=='-' && argv[i][2]=='A')
2066 resetArchive=true;
2067 }
2068 --i;
2069 }
2070 else if ((opt=="-version" || opt=="-until") && i+1<argc) { // read date
2071
2072 // Read digits from multiple args and fill in leading zeros
2073 version=0;
2074 int digits=0;
2075 if (argv[i+1][0]=='-') { // negative version
2076 version=atol(argv[i+1]);
2077 if (version>-1) usage();
2078 ++i;
2079 }
2080 else { // positive version or date
2081 while (++i<argc && argv[i][0]!='-') {
2082 for (int j=0; ; ++j) {
2083 if (isdigit(argv[i][j])) {
2084 version=version*10+argv[i][j]-'0';
2085 ++digits;
2086 }
2087 else {
2088 if (digits==1) version=version/10*100+version%10;
2089 digits=0;
2090 if (argv[i][j]==0) break;
2091 }
2092 }
2093 }
2094 --i;
2095 }
2096
2097 // Append default time
2098 if (version>=19000000LL && version<=29991231LL)
2099 version=version*100+23;
2100 if (version>=1900000000LL && version<=2999123123LL)
2101 version=version*100+59;
2102 if (version>=190000000000LL && version<=299912312359LL)
2103 version=version*100+59;
2104 if (version>9999999) {
2105 if (version<19000101000000LL || version>29991231235959LL) {
2106 fprintf(stderr,
2107 "Version date %1.0f must be 19000101000000 to 29991231235959\n",
2108 double(version));
2109 exit(1);
2110 }
2111 date=version;
2112 }
2113 }
2114 else if (opt=="-method" && i<argc-1)
2115 method=argv[++i];
2116 else if (opt=="-key") {
2117 if (read_password(password_string, 2-exists(archive, 1),
2118 argc, argv, i))
2119 password=password_string;
2120 }
2121 else if (opt=="-newkey") {
2122 if (read_password(new_password_string, 2, argc, argv, i))
2123 new_password=new_password_string;
2124 }
2125 else
2126 usage();
2127 }
2128
2129 // Set threads
2130 if (!threads)
2131 threads=numberOfProcessors();
2132
2133 // Test date
2134 if (now==-1 || date<19000000000000LL || date>30000000000000LL)
2135 error("date is incorrect, use -until YYYY-MM-DD HH:MM:SS to set");
2136
2137 // Set verbosity level
2138 if ((command=="-add" || command=="-delete" || command=="-test"
2139 || command=="-extract") && quiet==-1)
2140 quiet=MAX_QUIET-1;
2141
2142 // Adjust negative version
2143 if (version<0) {
2144 Jidac jidac(*this);
2145 jidac.version=DEFAULT_VERSION;
2146 if (!jidac.read_archive()) // not found?
2147 jidac.read_archive(0, subpart(archive, 0).c_str()); // try remote index
2148 version+=size(jidac.ver)-1;
2149 }
2150
2151 // Execute command
2152 if (quiet<MAX_QUIET)
2153 fprintf(con, "zpaq v" ZPAQ_VERSION " journaling archiver, compiled "
2154 __DATE__ "\n");
2155 if (size(files) && (command=="-add" || command=="-delete")) return add();
2156 else if (command=="-extract" || command=="-test") return extract();
2157 else if (command=="-list") list();
2158 else if (command=="-compare") return compare();
2159 else if (command=="-purge") purge();
2160 else usage();
2161 return 0;
2162 }
2163
2164 // Read arc (default: archive) up to -date into ht, dt, ver. Return place to
2165 // append. If errors is not NULL then set it to number of errors found.
read_archive(int * errors,const char * arc)2166 int64_t Jidac::read_archive(int *errors, const char* arc) {
2167 if (errors) *errors=0;
2168 dcsize=dhsize=0;
2169
2170 // Open archive or archive.zpaq. If not found then try the index of
2171 // a multi-part archive.
2172 if (!arc) arc=archive.c_str();
2173 Archive in;
2174 if (!in.open(arc, password)) {
2175 if (command!="-add") {
2176 printUTF8(arc, stderr);
2177 fprintf(stderr, " not found.\n");
2178 if (errors) ++*errors;
2179 }
2180 return 0;
2181 }
2182 if (quiet<MAX_QUIET) {
2183 printUTF8(arc, con);
2184 if (version==DEFAULT_VERSION)
2185 fprintf(con, ": ");
2186 else
2187 fprintf(con, " -until %1.0f: ", version+0.0);
2188 fflush(con);
2189 }
2190
2191 // Test password
2192 if (password) {
2193 char s[4]={0};
2194 const int nr=in.read(s, 4);
2195 if (nr>0 && memcmp(s, "7kSt", 4) && (memcmp(s, "zPQ", 3) || s[3]<1))
2196 error("password incorrect");
2197 in.seek(-nr, SEEK_CUR);
2198 }
2199
2200 // Scan archive contents
2201 string lastfile=arc; // last named file in streaming format
2202 if (size(lastfile)>5)
2203 lastfile=lastfile.substr(0, size(lastfile)-5); // drop .zpaq
2204 int64_t block_offset=32*(password!=0); // start of last block of any type
2205 int64_t data_offset=block_offset; // start of last block of d fragments
2206 int64_t segment_offset=block_offset; // start of last segment
2207 bool found_data=false; // exit if nothing found
2208 bool first=true; // first segment in archive?
2209 enum {NORMAL, ERR, RECOVER} pass=NORMAL; // recover ht from data blocks?
2210 StringBuffer os(32832); // decompressed block
2211 map<int64_t, double> compressionRatio; // block offset -> compression ratio
2212
2213 // Detect archive format and read the filenames, fragment sizes,
2214 // and hashes. In JIDAC format, these are in the index blocks, allowing
2215 // data to be skipped. Otherwise the whole archive is scanned to get
2216 // this information from the segment headers and trailers.
2217 bool done=false;
2218 while (!done) {
2219 try {
2220
2221 // If there is an error in the h blocks, scan a second time in RECOVER
2222 // mode to recover the redundant fragment data from the d blocks.
2223 libzpaq::Decompresser d;
2224 d.setInput(&in);
2225 if (d.findBlock())
2226 found_data=true;
2227 else if (pass==ERR) {
2228 segment_offset=block_offset=32*(password!=0);
2229 in.seek(block_offset, SEEK_SET);
2230 if (!d.findBlock()) break;
2231 pass=RECOVER;
2232 if (quiet<MAX_QUIET)
2233 fprintf(con, "Attempting to recover fragment tables...\n");
2234 }
2235 else
2236 break;
2237
2238 // Read the segments in the current block
2239 StringWriter filename, comment;
2240 int segs=0;
2241 while (d.findFilename(&filename)) {
2242 if (filename.s.size()) {
2243 for (unsigned i=0; i<filename.s.size(); ++i)
2244 if (filename.s[i]=='\\') filename.s[i]='/';
2245 lastfile=filename.s.c_str();
2246 }
2247 comment.s="";
2248 d.readComment(&comment);
2249 if (quiet<MAX_QUIET && pass!=NORMAL)
2250 fprintf(con, "Reading %s %s at %1.0f\n", filename.s.c_str(),
2251 comment.s.c_str(), double(block_offset));
2252 int64_t usize=0; // read uncompressed size from comment or -1
2253 int64_t fdate=0; // read date from filename or -1
2254 int64_t fattr=0; // read attributes from comment as wN or uN
2255 unsigned num=0; // read fragment ID from filename
2256 const char* p=comment.s.c_str();
2257 for (; isdigit(*p); ++p) // read size
2258 usize=usize*10+*p-'0';
2259 if (p==comment.s.c_str()) usize=-1; // size not found
2260 for (; *p && fdate<19000000000000LL; ++p) // read date
2261 if (isdigit(*p)) fdate=fdate*10+*p-'0';
2262 if (fdate<19000000000000LL || fdate>=30000000000000LL) fdate=-1;
2263
2264 // Read the comment attribute wN or uN where N is a number
2265 int attrchar=0;
2266 for (; true; ++p) {
2267 if (*p=='u' || *p=='w') {
2268 attrchar=*p;
2269 fattr=0;
2270 }
2271 else if (isdigit(*p) && (attrchar=='u' || attrchar=='w'))
2272 fattr=fattr*10+*p-'0';
2273 else if (attrchar) {
2274 fattr=fattr*256+attrchar;
2275 attrchar=0;
2276 }
2277 if (!*p) break;
2278 }
2279
2280 // Test for JIDAC format. Filename is jDC<fdate>[cdhi]<num>
2281 // and comment ends with " jDC\x01"
2282 if (comment.s.size()>=4
2283 && usize>=0
2284 && comment.s.substr(comment.s.size()-4)=="jDC\x01"
2285 && filename.s.size()==28
2286 && filename.s.substr(0, 3)=="jDC"
2287 && strchr("cdhi", filename.s[17])) {
2288
2289 // Read the date and number in the filename
2290 num=0;
2291 fdate=0;
2292 for (unsigned i=3; i<17 && isdigit(filename.s[i]); ++i)
2293 fdate=fdate*10+filename.s[i]-'0';
2294 for (unsigned i=18; i<filename.s.size() && isdigit(filename.s[i]);
2295 ++i)
2296 num=num*10+filename.s[i]-'0';
2297
2298 // Decompress the block. In recovery mode, only decompress
2299 // data blocks containing missing HT data.
2300 os.reset();
2301 os.setLimit(usize);
2302 d.setOutput(&os);
2303 libzpaq::SHA1 sha1;
2304 d.setSHA1(&sha1);
2305 if (pass!=RECOVER || (filename.s[17]=='d' && num>0 &&
2306 num<ht.size() && ht[num].csize==HT_BAD)) {
2307 d.decompress();
2308 char sha1result[21]={0};
2309 d.readSegmentEnd(sha1result);
2310 if (usize!=int64_t(sha1.usize())) {
2311 fprintf(stderr, "%s size should be %1.0f, is %1.0f\n",
2312 filename.s.c_str(), double(usize),
2313 double(sha1.usize()));
2314 error("incorrect block size");
2315 }
2316 if (sha1result[0] && memcmp(sha1result+1, sha1.result(), 20)) {
2317 fprintf(stderr, "%s checksum error\n", filename.s.c_str());
2318 error("bad checksum");
2319 }
2320 }
2321 else
2322 d.readSegmentEnd();
2323
2324 // Transaction header (type c).
2325 // If in the future then stop here, else read 8 byte data size
2326 // from input and jump over it.
2327 if (filename.s[17]=='c' && fdate>=19000000000000LL
2328 && fdate<30000000000000LL && pass!=RECOVER) {
2329 data_offset=in.tell()+1;
2330 bool isbreak=version<19000000000000LL ? size(ver)>version :
2331 version<fdate;
2332 int64_t jmp=0;
2333 if (!isbreak && os.size()==8) { // jump
2334 const char* s=os.c_str();
2335 jmp=btol(s);
2336 if (jmp<0) {
2337 fprintf(stderr, "Incomplete transaction ignored\n");
2338 isbreak=true;
2339 }
2340 else if (jmp>0) {
2341 dcsize+=jmp;
2342 in.seek(jmp, SEEK_CUR);
2343 }
2344 }
2345 if (os.size()!=8) {
2346 fprintf(stderr, "Bad JIDAC header size: %d\n", size(os));
2347 isbreak=true;
2348 if (*errors) ++*errors;
2349 }
2350 if (isbreak) {
2351 done=true;
2352 break;
2353 }
2354 ver.push_back(VER());
2355 ver.back().firstFragment=size(ht);
2356 ver.back().offset=block_offset;
2357 ver.back().date=fdate;
2358 ver.back().csize=jmp;
2359 }
2360
2361 // Fragment table (type h).
2362 // Contents is bsize[4] (sha1[20] usize[4])... for fragment N...
2363 // where bsize is the compressed block size.
2364 // Store in ht[].{sha1,usize}. Set ht[].csize to block offset
2365 // assuming N in ascending order.
2366 else if (filename.s[17]=='h' && num>0 && os.size()>=4
2367 && pass!=RECOVER) {
2368 const char* s=os.c_str();
2369 const unsigned bsize=btoi(s);
2370 dhsize+=bsize;
2371 assert(size(ver)>0);
2372 const unsigned n=(os.size()-4)/24;
2373 if (ht.size()>num) {
2374 fprintf(stderr,
2375 "Unordered fragment tables: expected >= %d found %1.0f\n",
2376 size(ht), double(num));
2377 pass=ERR;
2378 }
2379 double usum=0; // total uncompressed size
2380 for (unsigned i=0; i<n; ++i) {
2381 while (ht.size()<=num+i) ht.push_back(HT());
2382 memcpy(ht[num+i].sha1, s, 20);
2383 s+=20;
2384 if (ht[num+i].csize!=HT_BAD) error("duplicate fragment ID");
2385 usum+=ht[num+i].usize=btoi(s);
2386 ht[num+i].csize=i?-int(i):data_offset;
2387 }
2388 if (usum>0) compressionRatio[data_offset]=bsize/usum;
2389 data_offset+=bsize;
2390 }
2391
2392 // Index (type i)
2393 // Contents is: 0[8] filename 0 (deletion)
2394 // or: date[8] filename 0 na[4] attr[na] ni[4] ptr[ni][4]
2395 // Read into DT
2396 else if (filename.s[17]=='i' && pass!=RECOVER) {
2397 const bool islist=command=="-list" || command=="-compare";
2398 const char* s=os.c_str();
2399 const char* const end=s+os.size();
2400 while (s<=end-9) {
2401 const char* fp=s+8; // filename
2402 DT& dtr=dt[fp];
2403 dtr.dtv.push_back(DTV());
2404 DTV& dtv=dtr.dtv.back();
2405 dtv.version=size(ver)-1;
2406 dtv.date=btol(s);
2407 assert(size(ver)>0);
2408 if (dtv.date) ++ver.back().updates;
2409 else ++ver.back().deletes;
2410 s+=strlen(fp)+1; // skip filename
2411 if (dtv.date && s<=end-8) {
2412 const unsigned na=btoi(s);
2413 for (unsigned i=0; i<na && s<end; ++i, ++s) // read attr
2414 if (i<8) dtv.attr+=int64_t(*s&255)<<(i*8);
2415 if (noattributes) dtv.attr=0;
2416 if (s<=end-4) {
2417 const unsigned ni=btoi(s);
2418 dtv.ptr.resize(ni);
2419 for (unsigned i=0; i<ni && s<=end-4; ++i) { // read ptr
2420 const unsigned j=dtv.ptr[i]=btoi(s);
2421 if (j<1 || j>=ht.size()+(1<<24))
2422 error("bad fragment ID");
2423 while (j>=ht.size()) {
2424 pass=ERR;
2425 ht.push_back(HT());
2426 }
2427 dtv.size+=ht[j].usize;
2428 ver.back().usize+=ht[j].usize;
2429
2430 // Estimate compressed size
2431 if (islist) {
2432 unsigned k=j;
2433 if (ht[j].csize<0 && ht[j].csize!=HT_BAD)
2434 k+=ht[j].csize;
2435 if (k>0 && k<ht.size() && ht[k].csize!=HT_BAD
2436 && ht[k].csize>=0)
2437 dtv.csize+=compressionRatio[ht[k].csize]*ht[j].usize;
2438 }
2439 }
2440 }
2441 }
2442 }
2443 }
2444
2445 // Recover fragment sizes and hashes from data block
2446 else if (pass==RECOVER && filename.s[17]=='d' && num>0
2447 && num<ht.size()) {
2448 if (os.size()>=8 && ht[num].csize==HT_BAD) {
2449 const char* p=os.c_str()+os.size()-8;
2450 unsigned n=btoi(p); // first fragment == num or 0
2451 if (n==0) n=num;
2452 unsigned f=btoi(p); // number of fragments
2453 if (n!=num && quiet<MAX_QUIET)
2454 fprintf(con, "fragments %u-%u were moved to %u-%u\n",
2455 n, n+f-1, num, num+f-1);
2456 n=num;
2457 if (f && f*4+8<=os.size()) {
2458 if (quiet<MAX_QUIET)
2459 fprintf(con, "Recovering fragments %u-%u at %1.0f\n",
2460 n, n+f-1, double(block_offset));
2461 while (ht.size()<=n+f) ht.push_back(HT());
2462 p=os.c_str()+os.size()-8-4*f;
2463
2464 // read fragment sizes into ht[n..n+f-1].usize
2465 unsigned sum=0;
2466 for (unsigned i=0; i<f; ++i) {
2467 sum+=ht[n+i].usize=btoi(p);
2468 ht[n+i].csize=i ? -int(i) : block_offset;
2469 }
2470
2471 // Compute hashes
2472 if (sum+f*4+8==os.size()) {
2473 if (quiet<MAX_QUIET)
2474 fprintf(con, "Computing hashes for %d bytes\n", sum);
2475 libzpaq::SHA1 sha1;
2476 p=os.c_str();
2477 for (unsigned i=0; i<f; ++i) {
2478 for (int j=0; j<ht[n+i].usize; ++j) {
2479 assert(p<os.c_str()+os.size());
2480 sha1.put(*p++);
2481 }
2482 memcpy(ht[n+i].sha1, sha1.result(), 20);
2483 }
2484 assert(p==os.c_str()+sum);
2485 }
2486 }
2487 }
2488
2489 // Correct bad offsets
2490 assert(num>0 && num<ht.size());
2491 if (quiet<MAX_QUIET && ht[num].csize!=block_offset) {
2492 fprintf(con, "Changing block %d offset from %1.0f to %1.0f\n",
2493 num, double(ht[num].csize), double(block_offset));
2494 ht[num].csize=block_offset;
2495 }
2496 }
2497
2498 // Bad JIDAC block
2499 else if (pass!=RECOVER) {
2500 fprintf(stderr, "Bad JIDAC block ignored: %s %s\n",
2501 filename.s.c_str(), comment.s.c_str());
2502 if (errors) ++*errors;
2503 }
2504 }
2505
2506 // Streaming format
2507 else if (pass!=RECOVER) {
2508
2509 // If previous version is dated or does not exist, start a new one
2510 if (segs==0 && (size(ver)==1 || ver.back().date!=0)) {
2511 if (size(ver)>version) {
2512 done=true;
2513 break;
2514 }
2515 ver.push_back(VER());
2516 ver.back().firstFragment=size(ht);
2517 ver.back().offset=block_offset;
2518 ver.back().csize=-1;
2519 }
2520
2521 char sha1result[21]={0};
2522 d.readSegmentEnd(sha1result);
2523 DT& dtr=dt[lastfile];
2524 if (filename.s.size()>0 || first) {
2525 dtr.dtv.push_back(DTV());
2526 dtr.dtv.back().date=fdate;
2527 dtr.dtv.back().attr=noattributes?0:fattr;
2528 dtr.dtv.back().version=size(ver)-1;
2529 ++ver.back().updates;
2530 }
2531 assert(dtr.dtv.size()>0);
2532 dtr.dtv.back().ptr.push_back(size(ht));
2533 if (usize>=0 && dtr.dtv.back().size>=0) dtr.dtv.back().size+=usize;
2534 else dtr.dtv.back().size=-1;
2535 dtr.dtv.back().csize+=in.tell()-segment_offset;
2536 if (usize>=0) ver.back().usize+=usize;
2537 ht.push_back(HT(sha1result+1, usize>0x7fffffff ? -1 : usize,
2538 segs ? -segs : block_offset));
2539 assert(size(ver)>0);
2540 }
2541 ++segs;
2542 filename.s="";
2543 first=false;
2544 segment_offset=in.tell();
2545 } // end while findFilename
2546 if (!done) segment_offset=block_offset=in.tell();
2547 } // end try
2548 catch (std::exception& e) {
2549 block_offset=in.tell();
2550 fprintf(stderr, "Skipping block at %1.0f: %s\n", double(block_offset),
2551 e.what());
2552 if (errors) ++*errors;
2553 }
2554 } // end while !done
2555 if (in.tell()>32*(password!=0) && !found_data)
2556 error("archive contains no data");
2557 in.close();
2558
2559 // Recompute file sizes in recover mode
2560 if (pass==RECOVER) {
2561 fprintf(stderr, "Recomputing file sizes\n");
2562 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
2563 for (unsigned i=0; i<p->second.dtv.size(); ++i) {
2564 p->second.dtv[i].size=0;
2565 for (unsigned j=0; j<p->second.dtv[i].ptr.size(); ++j) {
2566 unsigned k=p->second.dtv[i].ptr[j];
2567 if (k>0 && k<ht.size())
2568 p->second.dtv[i].size+=ht[k].usize;
2569 }
2570 }
2571 }
2572 }
2573 if (quiet<MAX_QUIET)
2574 fprintf(con, "%d versions, %d files, %d fragments, %1.6f MB\n",
2575 size(ver)-1, size(dt), size(ht)-1, block_offset*0.000001);
2576 return block_offset;
2577 }
2578
2579 // Mark each file in dt that matches the command args
2580 // (in files[]) and not matched to -not (in notfiles[])
2581 // using written=0 for each match. Match all files in dt if no args
2582 // (files[] is empty). If mark_all is true, then mark deleted files too.
2583 // If scan is true then recursively scan external directories in args,
2584 // or all files in dt if no args, add to dt, and mark them.
read_args(bool scan,bool mark_all)2585 void Jidac::read_args(bool scan, bool mark_all) {
2586
2587 // Match to files[] except notfiles[] or match all if files[] is empty
2588 if (since<0) since+=ver.size();
2589 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
2590 if (p->second.dtv.size()<1) {
2591 fprintf(stderr, "Invalid index entry: %s\n", p->first.c_str());
2592 error("corrupted index");
2593 }
2594 bool matched=size(files)==0;
2595 for (int i=0; !matched && i<size(files); ++i)
2596 if (ispath(files[i].c_str(), p->first.c_str()))
2597 matched=true;
2598 for (int i=0; matched && i<size(notfiles); ++i) {
2599 if (ispath(notfiles[i].c_str(), p->first.c_str()))
2600 matched=false;
2601 if (isattr(notfiles[i].c_str(), p->second.dtv.back().attr))
2602 matched=false;
2603 }
2604 if (matched &&
2605 (mark_all || (p->second.dtv.size() && p->second.dtv.back().date
2606 && p->second.dtv.back().version>=since)))
2607 p->second.written=0;
2608 }
2609
2610 // Scan external files and directories, insert into dt and mark written=0
2611 if (scan)
2612 for (int i=0; i<size(files); ++i)
2613 scandir(rename(files[i]));
2614
2615 // If no args then scan all files
2616 if (scan && size(files)==0) {
2617 vector<string> v;
2618 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p)
2619 if (mark_all || (p->second.dtv.size() && p->second.dtv.back().date))
2620 v.push_back(p->first);
2621 for (int i=0; i<size(v); ++i)
2622 scandir(rename(v[i]), false);
2623 }
2624 }
2625
// Return the directory part of fn: everything up to and including the
// last "/" or "\", or an empty string if fn contains neither.
string path(const string& fn) {
  const char* const begin=fn.c_str();
  const char* cut=begin;  // one past the last separator seen so far
  for (const char* p=begin; *p; ++p) {
    if (*p=='/' || *p=='\\')
      cut=p+1;
  }
  return fn.substr(0, cut-begin);
}
2633
2634 // Insert filename (UTF-8 with "/") into dt unless in notfiles. If filename
2635 // is a directory and recurse is true then also insert its contents.
2636 // In Windows, filename might have wildcards like "file.*" or "dir/*"
scandir(string filename,bool recurse)2637 void Jidac::scandir(string filename, bool recurse) {
2638
2639 // Omit if in notfiles
2640 for (int i=0; i<size(notfiles); ++i)
2641 if (ispath(notfiles[i].c_str(), unrename(filename).c_str())) return;
2642
2643 #ifdef unix
2644
2645 // Add regular files and directories
2646 struct stat sb;
2647 if (filename!="" && filename[filename.size()-1]=='/')
2648 filename=filename.substr(0, filename.size()-1);
2649 if (!lstat(filename.c_str(), &sb)) {
2650 for (int i=0; i<size(notfiles); ++i) // Omit if in not attributes
2651 if (isattr(notfiles[i].c_str(), 'u'+(sb.st_mode<<8)))
2652 return;
2653 if (S_ISREG(sb.st_mode))
2654 addfile(filename, decimal_time(sb.st_mtime), sb.st_size,
2655 'u'+(sb.st_mode<<8));
2656
2657 // Traverse directory
2658 if (S_ISDIR(sb.st_mode)) {
2659 addfile(filename+"/", decimal_time(sb.st_mtime), 0,
2660 'u'+(sb.st_mode<<8));
2661 if (recurse) {
2662 DIR* dirp=opendir(filename.c_str());
2663 if (dirp) {
2664 for (dirent* dp=readdir(dirp); dp; dp=readdir(dirp)) {
2665 if (strcmp(".", dp->d_name) && strcmp("..", dp->d_name)) {
2666 string s=filename;
2667 int len=s.size();
2668 if (len>0 && s[len-1]!='/' && s[len-1]!='\\') s+="/";
2669 s+=dp->d_name;
2670 scandir(s);
2671 }
2672 }
2673 closedir(dirp);
2674 }
2675 else
2676 perror(filename.c_str());
2677 }
2678 }
2679 }
2680 else if (recurse || errno!=ENOENT)
2681 perror(filename.c_str());
2682
2683 #else // Windows: expand wildcards in filename
2684
2685 // Expand wildcards
2686 WIN32_FIND_DATA ffd;
2687 string t=filename;
2688 if (t.size()>0 && t[t.size()-1]=='/') {
2689 if (recurse) t+="*";
2690 else filename=t=t.substr(0, t.size()-1);
2691 }
2692 HANDLE h=FindFirstFile(utow(t.c_str(), true).c_str(), &ffd);
2693 if (h==INVALID_HANDLE_VALUE && (recurse ||
2694 (GetLastError()!=ERROR_FILE_NOT_FOUND &&
2695 GetLastError()!=ERROR_PATH_NOT_FOUND)))
2696 winError(t.c_str());
2697 while (h!=INVALID_HANDLE_VALUE) {
2698
2699 // For each file, get name, date, size, attributes
2700 SYSTEMTIME st;
2701 int64_t edate=0;
2702 if (FileTimeToSystemTime(&ffd.ftLastWriteTime, &st))
2703 edate=st.wYear*10000000000LL+st.wMonth*100000000LL+st.wDay*1000000
2704 +st.wHour*10000+st.wMinute*100+st.wSecond;
2705 const int64_t esize=ffd.nFileSizeLow+(int64_t(ffd.nFileSizeHigh)<<32);
2706 const int64_t eattr='w'+(int64_t(ffd.dwFileAttributes)<<8);
2707
2708 // Ignore links, the names "." and ".." or any path/name in notfiles
2709 t=wtou(ffd.cFileName);
2710 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT
2711 || t=="." || t=="..") edate=0; // don't add
2712 string fn=path(filename)+t;
2713 for (int i=0; edate && i<size(notfiles); ++i) {
2714 if (ispath(notfiles[i].c_str(), unrename(fn).c_str())) edate=0;
2715 if (isattr(notfiles[i].c_str(), eattr)) edate=0;
2716 }
2717
2718 // Save directory names with a trailing / and scan their contents
2719 // Otherwise, save plain files
2720 if (edate) {
2721 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
2722 fn+="/";
2723 addfile(fn, edate, esize, eattr);
2724 if (recurse && (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
2725 fn+="*";
2726 scandir(fn);
2727 }
2728 }
2729 if (!FindNextFile(h, &ffd)) {
2730 if (GetLastError()!=ERROR_NO_MORE_FILES)
2731 winError(fn.c_str());
2732 break;
2733 }
2734 }
2735 FindClose(h);
2736 #endif
2737 }
2738
2739 // Add external file and its date, size, and attributes to dt
addfile(string filename,int64_t edate,int64_t esize,int64_t eattr)2740 void Jidac::addfile(string filename, int64_t edate,
2741 int64_t esize, int64_t eattr) {
2742 DT& d=dt[unrename(filename)];
2743 d.edate=edate;
2744 d.esize=esize;
2745 d.eattr=noattributes?0:eattr;
2746 d.written=0;
2747 }
2748
2749 ////////////////////////// divsufsort ///////////////////////////////
2750
2751 /*
2752 * divsufsort.c for libdivsufsort-lite
2753 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
2754 *
2755 * Permission is hereby granted, free of charge, to any person
2756 * obtaining a copy of this software and associated documentation
2757 * files (the "Software"), to deal in the Software without
2758 * restriction, including without limitation the rights to use,
2759 * copy, modify, merge, publish, distribute, sublicense, and/or sell
2760 * copies of the Software, and to permit persons to whom the
2761 * Software is furnished to do so, subject to the following
2762 * conditions:
2763 *
2764 * The above copyright notice and this permission notice shall be
2765 * included in all copies or substantial portions of the Software.
2766 *
2767 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2768 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
2769 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2770 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
2771 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
2772 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2773 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
2774 * OTHER DEALINGS IN THE SOFTWARE.
2775 */
2776
/*- Constants -*/

/* Keyword placed on the small static helper functions. */
#define INLINE __inline

/* Alphabet size: defaults to 256; a predefined value below 1 is discarded. */
#ifdef ALPHABET_SIZE
# if ALPHABET_SIZE < 1
#  undef ALPHABET_SIZE
# endif
#endif
#ifndef ALPHABET_SIZE
# define ALPHABET_SIZE (256)
#endif
#define BUCKET_A_SIZE (ALPHABET_SIZE)
#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)

/* Insertion sort threshold: defaults to 8; a predefined value is raised
   to at least 1. */
#ifndef SS_INSERTIONSORT_THRESHOLD
# define SS_INSERTIONSORT_THRESHOLD (8)
#elif SS_INSERTIONSORT_THRESHOLD < 1
# undef SS_INSERTIONSORT_THRESHOLD
# define SS_INSERTIONSORT_THRESHOLD (1)
#endif

/* Block size: defaults to 1024; a predefined value is clamped to
   0..32767. */
#ifndef SS_BLOCKSIZE
# define SS_BLOCKSIZE (1024)
#elif SS_BLOCKSIZE < 0
# undef SS_BLOCKSIZE
# define SS_BLOCKSIZE (0)
#elif 32768 <= SS_BLOCKSIZE
# undef SS_BLOCKSIZE
# define SS_BLOCKSIZE (32767)
#endif

/* Stack sizes. minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
#if SS_BLOCKSIZE == 0
# define SS_MISORT_STACKSIZE (96)
#elif SS_BLOCKSIZE <= 4096
# define SS_MISORT_STACKSIZE (16)
#else
# define SS_MISORT_STACKSIZE (24)
#endif
#define SS_SMERGE_STACKSIZE (32)
#define TR_INSERTIONSORT_THRESHOLD (8)
#define TR_STACKSIZE (64)
2817
2818
/*- Macros -*/

/* Exchange _a and _b. A variable t must be declared by the user. */
#ifndef SWAP
# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
#endif /* SWAP */

/* Smaller and larger of two values. Arguments may be evaluated twice. */
#ifndef MIN
# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
#endif /* MIN */
#ifndef MAX
# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
#endif /* MAX */

/* Explicit stack. The user must provide an array stack[] of records with
   members a, b, c, d (and e for the 5 value forms), its capacity
   STACK_SIZE and the number of records in use, ssize. */

/* Push one record of 4 values. */
#define STACK_PUSH(_a, _b, _c, _d)\
  do {\
    assert(ssize < STACK_SIZE);\
    stack[ssize].a = (_a);\
    stack[ssize].b = (_b);\
    stack[ssize].c = (_c);\
    stack[ssize].d = (_d);\
    ++ssize;\
  } while(0)

/* Push one record of 5 values. */
#define STACK_PUSH5(_a, _b, _c, _d, _e)\
  do {\
    assert(ssize < STACK_SIZE);\
    stack[ssize].a = (_a);\
    stack[ssize].b = (_b);\
    stack[ssize].c = (_c);\
    stack[ssize].d = (_d);\
    stack[ssize].e = (_e);\
    ++ssize;\
  } while(0)

/* Pop one record of 4 values, or return from the enclosing function
   if the stack is empty. */
#define STACK_POP(_a, _b, _c, _d)\
  do {\
    assert(0 <= ssize);\
    if(ssize == 0) { return; }\
    --ssize;\
    (_a) = stack[ssize].a;\
    (_b) = stack[ssize].b;\
    (_c) = stack[ssize].c;\
    (_d) = stack[ssize].d;\
  } while(0)

/* Pop one record of 5 values, or return from the enclosing function
   if the stack is empty. */
#define STACK_POP5(_a, _b, _c, _d, _e)\
  do {\
    assert(0 <= ssize);\
    if(ssize == 0) { return; }\
    --ssize;\
    (_a) = stack[ssize].a;\
    (_b) = stack[ssize].b;\
    (_c) = stack[ssize].c;\
    (_d) = stack[ssize].d;\
    (_e) = stack[ssize].e;\
  } while(0)

/* Bucket access. BUCKET_B and BUCKET_BSTAR index the same array bucket_B
   with the roles of the two symbols exchanged. */
#define BUCKET_A(_c0) bucket_A[(_c0)]
#if ALPHABET_SIZE == 256
#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
#else
#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
#endif
2863
2864
2865 /*- Private Functions -*/
2866
/* lg_table[i] is the position of the highest set bit of i, that is
   floor(log2(i)), for i = 1..255. lg_table[0] is -1. */
static const int lg_table[256]= {
  /*   0.. 15 */ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,
  /*  16.. 31 */  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  /*  32.. 47 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  48.. 63 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  64.. 79 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  80.. 95 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  96..111 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 112..127 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 128..143 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 144..159 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 160..175 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 176..191 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 192..207 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 208..223 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 224..239 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 240..255 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
};
2877
2878 #if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
2879
2880 static INLINE
2881 int
ss_ilg(int n)2882 ss_ilg(int n) {
2883 #if SS_BLOCKSIZE == 0
2884 return (n & 0xffff0000) ?
2885 ((n & 0xff000000) ?
2886 24 + lg_table[(n >> 24) & 0xff] :
2887 16 + lg_table[(n >> 16) & 0xff]) :
2888 ((n & 0x0000ff00) ?
2889 8 + lg_table[(n >> 8) & 0xff] :
2890 0 + lg_table[(n >> 0) & 0xff]);
2891 #elif SS_BLOCKSIZE < 256
2892 return lg_table[n];
2893 #else
2894 return (n & 0xff00) ?
2895 8 + lg_table[(n >> 8) & 0xff] :
2896 0 + lg_table[(n >> 0) & 0xff];
2897 #endif
2898 }
2899
2900 #endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
2901
2902 #if SS_BLOCKSIZE != 0
2903
/* Seed table for ss_isqrt. Spot-checked entries match floor(16 * sqrt(i))
   (e.g. [1]=16, [4]=32, [16]=64, [64]=128, [255]=255) -- presumably the
   whole table follows that rule; TODO confirm if it is ever regenerated. */
static const int sqq_table[256] = {
  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
};
2922
2923 static INLINE
2924 int
ss_isqrt(int x)2925 ss_isqrt(int x) {
2926 int y, e;
2927
2928 if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
2929 e = (x & 0xffff0000) ?
2930 ((x & 0xff000000) ?
2931 24 + lg_table[(x >> 24) & 0xff] :
2932 16 + lg_table[(x >> 16) & 0xff]) :
2933 ((x & 0x0000ff00) ?
2934 8 + lg_table[(x >> 8) & 0xff] :
2935 0 + lg_table[(x >> 0) & 0xff]);
2936
2937 if(e >= 16) {
2938 y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
2939 if(e >= 24) { y = (y + 1 + x / y) >> 1; }
2940 y = (y + 1 + x / y) >> 1;
2941 } else if(e >= 8) {
2942 y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
2943 } else {
2944 return sqq_table[x] >> 4;
2945 }
2946
2947 return (x < (y * y)) ? y - 1 : y;
2948 }
2949
2950 #endif /* SS_BLOCKSIZE != 0 */
2951
2952
2953 /*---------------------------------------------------------------------------*/
2954
2955 /* Compares two suffixes. */
2956 static INLINE
2957 int
ss_compare(const unsigned char * T,const int * p1,const int * p2,int depth)2958 ss_compare(const unsigned char *T,
2959 const int *p1, const int *p2,
2960 int depth) {
2961 const unsigned char *U1, *U2, *U1n, *U2n;
2962
2963 for(U1 = T + depth + *p1,
2964 U2 = T + depth + *p2,
2965 U1n = T + *(p1 + 1) + 2,
2966 U2n = T + *(p2 + 1) + 2;
2967 (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
2968 ++U1, ++U2) {
2969 }
2970
2971 return U1 < U1n ?
2972 (U2 < U2n ? *U1 - *U2 : 1) :
2973 (U2 < U2n ? -1 : 0);
2974 }
2975
2976
2977 /*---------------------------------------------------------------------------*/
2978
2979 #if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
2980
2981 /* Insertionsort for small size groups */
2982 static
2983 void
ss_insertionsort(const unsigned char * T,const int * PA,int * first,int * last,int depth)2984 ss_insertionsort(const unsigned char *T, const int *PA,
2985 int *first, int *last, int depth) {
2986 int *i, *j;
2987 int t;
2988 int r;
2989
2990 for(i = last - 2; first <= i; --i) {
2991 for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
2992 do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
2993 if(last <= j) { break; }
2994 }
2995 if(r == 0) { *j = ~*j; }
2996 *(j - 1) = t;
2997 }
2998 }
2999
3000 #endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
3001
3002
3003 /*---------------------------------------------------------------------------*/
3004
3005 #if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
3006
3007 static INLINE
3008 void
ss_fixdown(const unsigned char * Td,const int * PA,int * SA,int i,int size)3009 ss_fixdown(const unsigned char *Td, const int *PA,
3010 int *SA, int i, int size) {
3011 int j, k;
3012 int v;
3013 int c, d, e;
3014
3015 for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
3016 d = Td[PA[SA[k = j++]]];
3017 if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
3018 if(d <= c) { break; }
3019 }
3020 SA[i] = v;
3021 }
3022
3023 /* Simple top-down heapsort. */
3024 static
3025 void
ss_heapsort(const unsigned char * Td,const int * PA,int * SA,int size)3026 ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
3027 int i, m;
3028 int t;
3029
3030 m = size;
3031 if((size % 2) == 0) {
3032 m--;
3033 if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
3034 }
3035
3036 for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
3037 if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
3038 for(i = m - 1; 0 < i; --i) {
3039 t = SA[0], SA[0] = SA[i];
3040 ss_fixdown(Td, PA, SA, 0, i);
3041 SA[i] = t;
3042 }
3043 }
3044
3045
3046 /*---------------------------------------------------------------------------*/
3047
3048 /* Returns the median of three elements. */
3049 static INLINE
3050 int *
ss_median3(const unsigned char * Td,const int * PA,int * v1,int * v2,int * v3)3051 ss_median3(const unsigned char *Td, const int *PA,
3052 int *v1, int *v2, int *v3) {
3053 int *t;
3054 if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
3055 if(Td[PA[*v2]] > Td[PA[*v3]]) {
3056 if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
3057 else { return v3; }
3058 }
3059 return v2;
3060 }
3061
3062 /* Returns the median of five elements. */
3063 static INLINE
3064 int *
ss_median5(const unsigned char * Td,const int * PA,int * v1,int * v2,int * v3,int * v4,int * v5)3065 ss_median5(const unsigned char *Td, const int *PA,
3066 int *v1, int *v2, int *v3, int *v4, int *v5) {
3067 int *t;
3068 if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
3069 if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
3070 if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
3071 if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
3072 if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
3073 if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
3074 return v3;
3075 }
3076
3077 /* Returns the pivot element. */
3078 static INLINE
3079 int *
ss_pivot(const unsigned char * Td,const int * PA,int * first,int * last)3080 ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
3081 int *middle;
3082 int t;
3083
3084 t = last - first;
3085 middle = first + t / 2;
3086
3087 if(t <= 512) {
3088 if(t <= 32) {
3089 return ss_median3(Td, PA, first, middle, last - 1);
3090 } else {
3091 t >>= 2;
3092 return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
3093 }
3094 }
3095 t >>= 3;
3096 first = ss_median3(Td, PA, first, first + t, first + (t << 1));
3097 middle = ss_median3(Td, PA, middle - t, middle, middle + t);
3098 last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
3099 return ss_median3(Td, PA, first, middle, last);
3100 }
3101
3102
3103 /*---------------------------------------------------------------------------*/
3104
3105 /* Binary partition for substrings. */
3106 static INLINE
3107 int *
ss_partition(const int * PA,int * first,int * last,int depth)3108 ss_partition(const int *PA,
3109 int *first, int *last, int depth) {
3110 int *a, *b;
3111 int t;
3112 for(a = first - 1, b = last;;) {
3113 for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
3114 for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { }
3115 if(b <= a) { break; }
3116 t = ~*b;
3117 *b = *a;
3118 *a = t;
3119 }
3120 if(first < a) { *first = ~*first; }
3121 return a;
3122 }
3123
/* Multikey introsort for medium size groups.
   Sorts the index range [first, last) by comparing, one byte at a time,
   the byte at offset `depth` (Td = T + depth) of the substring each entry
   refers to through PA. Recursion is replaced by an explicit stack of
   (first, last, depth, limit) records; the function returns through
   STACK_POP once that stack is empty.
   `limit` is a per-range budget initialised to ss_ilg(size): when it
   reaches zero the range is heapsorted on the current byte, and a
   negative limit means the range is already ordered on that byte. */
static
void
ss_mintrosort(const unsigned char *T, const int *PA,
              int *first, int *last,
              int depth) {
#define STACK_SIZE SS_MISORT_STACKSIZE
  /* a = first, b = last, c = depth, d = limit of a pending range. */
  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
  const unsigned char *Td;
  int *a, *b, *c, *d, *e, *f;
  int s, t;                 /* t is also the temporary used by SWAP */
  int ssize;
  int limit;
  int v, x = 0;

  for(ssize = 0, limit = ss_ilg(last - first);;) {

    /* Small range: finish it with insertion sort and fetch the next one. */
    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
#if 1 < SS_INSERTIONSORT_THRESHOLD
      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
#endif
      STACK_POP(first, last, depth, limit);
      continue;
    }

    Td = T + depth;
    /* Budget exhausted: order the range on the current byte by heapsort;
       limit becomes -1 so the branch below runs next. */
    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
    if(limit < 0) {
      /* Range is ordered on the current byte. Skip single-element runs
         and stop at the end of the first run [first, a) of equal bytes
         that is longer than one element. */
      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
        if((x = Td[PA[*a]]) != v) {
          if(1 < (a - first)) { break; }
          v = x;
          first = a;
        }
      }
      if(Td[PA[*first] - 1] < v) {
        first = ss_partition(PA, first, a, depth);
      }
      /* Continue with the smaller of [first, a) (one byte deeper) and
         [a, last) (same depth, still ordered); push the other. */
      if((a - first) <= (last - a)) {
        if(1 < (a - first)) {
          STACK_PUSH(a, last, depth, -1);
          last = a, depth += 1, limit = ss_ilg(a - first);
        } else {
          first = a, limit = -1;
        }
      } else {
        if(1 < (last - a)) {
          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
          first = a, limit = -1;
        } else {
          last = a, depth += 1, limit = ss_ilg(a - first);
        }
      }
      continue;
    }

    /* choose pivot */
    a = ss_pivot(Td, PA, first, last);
    v = Td[PA[*a]];
    SWAP(*first, *a);

    /* partition */
    /* Three-way partition on the byte v. Entries equal to v are first
       collected at both ends ([first, a) and (d, last)) while b and c
       scan inwards over the smaller and larger entries. */
    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
    if(((a = b) < last) && (x < v)) {
      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
        if(x == v) { SWAP(*b, *a); ++a; }
      }
    }
    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
    if((b < (d = c)) && (x > v)) {
      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
        if(x == v) { SWAP(*c, *d); --d; }
      }
    }
    for(; b < c;) {
      SWAP(*b, *c);
      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
        if(x == v) { SWAP(*b, *a); ++a; }
      }
      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
        if(x == v) { SWAP(*c, *d); --d; }
      }
    }

    if(a <= d) {
      /* At least one entry differs from v: swap the equal runs from the
         ends into the middle so the layout becomes
         [first, a) < v, [a, c) == v, [c, last) > v. */
      c = b - 1;

      if((s = a - first) > (t = b - a)) { s = t; }
      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
      if((s = d - c) > (t = last - d - 1)) { s = t; }
      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }

      a = first + (b - a), c = last - (d - c);
      /* [b, c) is the part of the equal range that goes one byte deeper. */
      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);

      /* Three sub-ranges remain: [first, a) and [c, last) at this depth
         and [b, c) at depth + 1. Continue with the smallest one and push
         the other two, largest first. */
      if((a - first) <= (last - c)) {
        if((last - c) <= (c - b)) {
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          STACK_PUSH(c, last, depth, limit);
          last = a;
        } else if((a - first) <= (c - b)) {
          STACK_PUSH(c, last, depth, limit);
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          last = a;
        } else {
          STACK_PUSH(c, last, depth, limit);
          STACK_PUSH(first, a, depth, limit);
          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
        }
      } else {
        if((a - first) <= (c - b)) {
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          STACK_PUSH(first, a, depth, limit);
          first = c;
        } else if((last - c) <= (c - b)) {
          STACK_PUSH(first, a, depth, limit);
          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
          first = c;
        } else {
          STACK_PUSH(first, a, depth, limit);
          STACK_PUSH(c, last, depth, limit);
          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
        }
      }
    } else {
      /* Every entry has byte v here: give back the budget unit consumed
         above and retry the whole range one byte deeper. */
      limit += 1;
      if(Td[PA[*first] - 1] < v) {
        first = ss_partition(PA, first, last, depth);
        limit = ss_ilg(last - first);
      }
      depth += 1;
    }
  }
#undef STACK_SIZE
}
3259
3260 #endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
3261
3262
3263 /*---------------------------------------------------------------------------*/
3264
3265 #if SS_BLOCKSIZE != 0
3266
3267 static INLINE
3268 void
ss_blockswap(int * a,int * b,int n)3269 ss_blockswap(int *a, int *b, int n) {
3270 int t;
3271 for(; 0 < n; --n, ++a, ++b) {
3272 t = *a, *a = *b, *b = t;
3273 }
3274 }
3275
/* In-place exchange of the two adjacent blocks [first, middle) and
   [middle, last) without an auxiliary buffer.
   l and r track the lengths of the left and right parts still to be
   processed; the loop ends as soon as either is zero, and equal-length
   parts are finished with a single ss_blockswap. */
static INLINE
void
ss_rotate(int *first, int *middle, int *last) {
  int *a, *b, t;
  int l, r;
  l = middle - first, r = last - middle;
  for(; (0 < l) && (0 < r);) {
    if(l == r) { ss_blockswap(first, middle, l); break; }
    if(l < r) {
      /* Left part is shorter: work backwards from the end of the range. */
      a = last - 1, b = middle - 1;
      t = *a;               /* value displaced from the current cycle */
      do {
        *a-- = *b, *b-- = *a;
        if(b < first) {
          /* b ran off the left part: close the cycle, shrink the range. */
          *a = t;
          last = a;
          if((r -= l + 1) <= l) { break; }
          a -= 1, b = middle - 1;
          t = *a;
        }
      } while(1);
    } else {
      /* Right part is shorter (or equal handled above): work forwards. */
      a = first, b = middle;
      t = *a;               /* value displaced from the current cycle */
      do {
        *a++ = *b, *b++ = *a;
        if(last <= b) {
          /* b ran off the right part: close the cycle, shrink the range. */
          *a = t;
          first = a + 1;
          if((l -= r + 1) <= r) { break; }
          a += 1, b = middle;
          t = *a;
        }
      } while(1);
    }
  }
}
3313
3314
3315 /*---------------------------------------------------------------------------*/
3316
/* Merges the two sorted runs [first, middle) and [middle, last) in place,
   using binary search plus ss_rotate instead of a buffer.
   Entries may be stored bit-complemented (negative) as an "equal to
   predecessor" mark; they are decoded with ~ before being compared. */
static
void
ss_inplacemerge(const unsigned char *T, const int *PA,
                int *first, int *middle, int *last,
                int depth) {
  const int *p;
  int *a, *b;
  int len, half;
  int q, r;
  int x;

  for(;;) {
    /* p: PA slot of the last entry of the right run; x records whether
       that entry was stored marked. */
    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
    else { x = 0; p = PA + *(last - 1); }
    /* Binary search in [first, middle) for the first entry that does not
       compare less than p. r keeps the last non-negative comparison
       result (-1 if there was none). */
    for(a = first, len = middle - first, half = len >> 1, r = -1;
        0 < len;
        len = half, half >>= 1) {
      b = a + half;
      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
      if(q < 0) {
        a = b + 1;
        half -= (len & 1) ^ 1;
      } else {
        r = q;
      }
    }
    if(a < middle) {
      /* An equal entry was met during the search: mark *a. */
      if(r == 0) { *a = ~*a; }
      /* Move [a, middle) behind the right run. */
      ss_rotate(a, middle, last);
      last -= middle - a;
      middle = a;
      if(first == middle) { break; }
    }
    /* Drop the entry just placed; if it was marked, also step back over
       the marked entries in front of it. */
    --last;
    if(x != 0) { while(*--last < 0) { } }
    if(middle == last) { break; }
  }
}
3355
3356
3357 /*---------------------------------------------------------------------------*/
3358
/* Merge-forward with internal buffer.
   Merges the sorted runs [first, middle) and [middle, last), front to
   back. The left run is first exchanged with the contents of buf (which
   must hold at least middle - first ints); every later move is done as
   an exchange, so the original buffer contents end up back in buf.
   Negative (bit-complemented) entries are moved along with the entry
   they follow without being compared. */
static
void
ss_mergeforward(const unsigned char *T, const int *PA,
                int *first, int *middle, int *last,
                int *buf, int depth) {
  int *a, *b, *c, *bufend;
  int t;
  int r;

  bufend = buf + (middle - first) - 1;
  ss_blockswap(buf, first, middle - first);

  /* a: output position, b: next entry of the buffered left run,
     c: next entry of the right run, t: displaced original buffer value. */
  for(t = *(a = first), b = buf, c = middle;;) {
    r = ss_compare(T, PA + *b, PA + *c, depth);
    if(r < 0) {
      /* Left entry is smaller: emit it and its marked followers. */
      do {
        *a++ = *b;
        if(bufend <= b) { *bufend = t; return; }
        *b++ = *a;
      } while(*b < 0);
    } else if(r > 0) {
      /* Right entry is smaller: emit it and its marked followers. */
      do {
        *a++ = *c, *c++ = *a;
        if(last <= c) {
          /* Right run exhausted: flush the rest of the buffer. */
          while(b < bufend) { *a++ = *b, *b++ = *a; }
          *a = *b, *b = t;
          return;
        }
      } while(*c < 0);
    } else {
      /* Equal: mark the right entry, then emit the left group followed
         by the right group. */
      *c = ~*c;
      do {
        *a++ = *b;
        if(bufend <= b) { *bufend = t; return; }
        *b++ = *a;
      } while(*b < 0);

      do {
        *a++ = *c, *c++ = *a;
        if(last <= c) {
          while(b < bufend) { *a++ = *b, *b++ = *a; }
          *a = *b, *b = t;
          return;
        }
      } while(*c < 0);
    }
  }
}
3408
/* Merge-backward with internal buffer.
   Merges the sorted runs [first, middle) and [middle, last), back to
   front. The right run is first exchanged with the contents of buf
   (which must hold at least last - middle ints); every later move is an
   exchange, so the original buffer contents end up back in buf.
   x is a bit set: bit 0 is set while the current buffered entry (b) was
   stored marked (negative), bit 1 likewise for the current left-run
   entry (c). p1 / p2 are the decoded PA slots of those two entries. */
static
void
ss_mergebackward(const unsigned char *T, const int *PA,
                 int *first, int *middle, int *last,
                 int *buf, int depth) {
  const int *p1, *p2;
  int *a, *b, *c, *bufend;
  int t;
  int r;
  int x;

  bufend = buf + (last - middle) - 1;
  ss_blockswap(buf, middle, last - middle);

  x = 0;
  if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; }
  else { p1 = PA + *bufend; }
  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
  else { p2 = PA + *(middle - 1); }
  /* a: output position, b: current buffered (right-run) entry,
     c: current left-run entry, t: displaced original buffer value. */
  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
    r = ss_compare(T, p1, p2, depth);
    if(0 < r) {
      /* Buffered entry is larger: emit it, preceded by its marked
         followers when it was itself marked. */
      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
      *a-- = *b;
      if(b <= buf) { *buf = t; break; }
      *b-- = *a;
      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
      else { p1 = PA + *b; }
    } else if(r < 0) {
      /* Left-run entry is larger: emit it the same way. */
      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
      *a-- = *c, *c-- = *a;
      if(c < first) {
        /* Left run exhausted: flush the rest of the buffer. */
        while(buf < b) { *a-- = *b, *b-- = *a; }
        *a = *b, *b = t;
        break;
      }
      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
      else { p2 = PA + *c; }
    } else {
      /* Equal: emit the buffered entry marked (~), then the left-run
         entry, and advance both. */
      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
      *a-- = ~*b;
      if(b <= buf) { *buf = t; break; }
      *b-- = *a;
      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
      *a-- = *c, *c-- = *a;
      if(c < first) {
        while(buf < b) { *a-- = *b, *b-- = *a; }
        *a = *b, *b = t;
        break;
      }
      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
      else { p1 = PA + *b; }
      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
      else { p2 = PA + *c; }
    }
  }
}
3467
/* D&C based merge.
   Merges the sorted runs [first, middle) and [middle, last) using a
   buffer of bufsize ints. When one run fits into the buffer it is merged
   directly (ss_mergebackward / ss_mergeforward). Otherwise a binary
   search finds a length m such that exchanging the last m entries of the
   left run with the first m entries of the right run leaves two
   independent, smaller merge problems; one is continued immediately and
   the other is pushed on an explicit stack. The function returns through
   STACK_POP once that stack is empty.
   `check` / `next` are bit sets (values 1, 2, 4) telling MERGE_CHECK
   which boundary entries of a finished sub-merge still need their
   "equal to predecessor" mark (bit complement) restored or recomputed. */
static
void
ss_swapmerge(const unsigned char *T, const int *PA,
             int *first, int *middle, int *last,
             int *buf, int bufsize, int depth) {
#define STACK_SIZE SS_SMERGE_STACKSIZE
/* Decodes a possibly marked (complemented) entry.
   NOTE: GETIDX and MERGE_CHECK are not #undef'd at the end of this
   function; only STACK_SIZE is. */
#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
/* Bit 1 of c: mark *a unconditionally. Bit 2: mark *a if it compares
   equal to the entry before it. Bit 4: mark *b if it compares equal to
   the entry before it. */
#define MERGE_CHECK(a, b, c)\
  do {\
    if(((c) & 1) ||\
       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
      *(a) = ~*(a);\
    }\
    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
      *(b) = ~*(b);\
    }\
  } while(0)
  /* a = first, b = middle, c = last, d = check of a pending merge. */
  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
  int *l, *r, *lm, *rm;
  int m, len, half;
  int ssize;
  int check, next;

  for(check = 0, ssize = 0;;) {
    /* Right run fits into the buffer. */
    if((last - middle) <= bufsize) {
      if((first < middle) && (middle < last)) {
        ss_mergebackward(T, PA, first, middle, last, buf, depth);
      }
      MERGE_CHECK(first, last, check);
      STACK_POP(first, middle, last, check);
      continue;
    }

    /* Left run fits into the buffer. */
    if((middle - first) <= bufsize) {
      if(first < middle) {
        ss_mergeforward(T, PA, first, middle, last, buf, depth);
      }
      MERGE_CHECK(first, last, check);
      STACK_POP(first, middle, last, check);
      continue;
    }

    /* Binary search for m: the number of entries around `middle` that
       are on the wrong side (right-run entry smaller than the left-run
       entry mirrored across `middle`). */
    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
        0 < len;
        len = half, half >>= 1) {
      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
        m += half + 1;
        half -= (len & 1) ^ 1;
      }
    }

    if(0 < m) {
      /* Exchange [lm, middle) with [middle, rm). */
      lm = middle - m, rm = middle + m;
      ss_blockswap(lm, middle, m);
      l = r = middle, next = 0;
      if(rm < last) {
        if(*rm < 0) {
          /* rm was marked: unmark it now and record that it must be
             re-marked after the sub-merge. */
          *rm = ~*rm;
          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
          next |= 1;
        } else if(first < lm) {
          for(; *r < 0; ++r) { }
          next |= 2;
        }
      }

      /* Continue with the smaller half, push the larger one. */
      if((l - first) <= (last - r)) {
        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
        middle = lm, last = l, check = (check & 3) | (next & 4);
      } else {
        if((next & 2) && (r == middle)) { next ^= 6; }
        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
        first = r, middle = rm, check = (next & 3) | (check & 4);
      }
    } else {
      /* Runs are already in order; only the seam may need a mark. */
      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
        *middle = ~*middle;
      }
      MERGE_CHECK(first, last, check);
      STACK_POP(first, middle, last, check);
    }
  }
#undef STACK_SIZE
}
3554
3555 #endif /* SS_BLOCKSIZE != 0 */
3556
3557
3558 /*---------------------------------------------------------------------------*/
3559
/* Substring sort.
   Sorts the index range [first, last) with the ss_* routines above.
   buf / bufsize describe scratch space for merging. When lastsuffix is
   non-zero, the first entry of the range is left out of the main sort
   and inserted afterwards by a linear scan (see the end of the function).
   With SS_BLOCKSIZE == 0 the whole range is handed to ss_mintrosort;
   otherwise it is sorted in blocks of SS_BLOCKSIZE entries that are then
   merged. */
static
void
sssort(const unsigned char *T, const int *PA,
       int *first, int *last,
       int *buf, int bufsize,
       int depth, int n, int lastsuffix) {
  int *a;
#if SS_BLOCKSIZE != 0
  int *b, *middle, *curbuf;
  int j, k, curbufsize, limit;
#endif
  int i;

  if(lastsuffix != 0) { ++first; }

#if SS_BLOCKSIZE == 0
  ss_mintrosort(T, PA, first, last, depth);
#else
  /* If the supplied buffer is small, carve a buffer of `limit` entries
     out of the tail of the range itself; that tail ([middle, last)) is
     sorted and merged in separately at the end. */
  if((bufsize < SS_BLOCKSIZE) &&
      (bufsize < (last - first)) &&
      (bufsize < (limit = ss_isqrt(last - first)))) {
    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
    buf = middle = last - limit, bufsize = limit;
  } else {
    middle = last, limit = 0;
  }
  /* Sort full blocks. After block number i is sorted, merge it with the
     preceding sorted runs once for every trailing 1 bit of i, so run
     lengths double like the carries of a binary counter. */
  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
#elif 1 < SS_BLOCKSIZE
    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
#endif
    /* Prefer the not-yet-sorted remainder of the range as merge buffer
       when it is larger than buf. */
    curbufsize = last - (a + SS_BLOCKSIZE);
    curbuf = a + SS_BLOCKSIZE;
    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
    }
  }
  /* Sort the final, possibly shorter block [a, middle). */
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
  ss_mintrosort(T, PA, a, middle, depth);
#elif 1 < SS_BLOCKSIZE
  ss_insertionsort(T, PA, a, middle, depth);
#endif
  /* Merge the runs that are still pending, one per set bit of i. */
  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
    if(i & 1) {
      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
      a -= k;
    }
  }
  /* Sort the tail that served as buffer and merge it in place. */
  if(limit != 0) {
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
    ss_mintrosort(T, PA, middle, last, depth);
#elif 1 < SS_BLOCKSIZE
    ss_insertionsort(T, PA, middle, last, depth);
#endif
    ss_inplacemerge(T, PA, first, middle, last, depth);
  }
#endif

  if(lastsuffix != 0) {
    /* Insert last type B* suffix. */
    /* PAi is a temporary two-entry stand-in for the PA slots of the
       skipped entry: its start position and n - 2 as the value that
       ss_compare uses to derive the end bound. */
    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
    /* Shift entries left until the insertion point is found; marked
       (negative) entries are shifted without being compared. */
    for(a = first, i = *(first - 1);
        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
        ++a) {
      *(a - 1) = *a;
    }
    *(a - 1) = i;
  }
}
3632
3633
3634 /*---------------------------------------------------------------------------*/
3635
3636 static INLINE
3637 int
tr_ilg(int n)3638 tr_ilg(int n) {
3639 return (n & 0xffff0000) ?
3640 ((n & 0xff000000) ?
3641 24 + lg_table[(n >> 24) & 0xff] :
3642 16 + lg_table[(n >> 16) & 0xff]) :
3643 ((n & 0x0000ff00) ?
3644 8 + lg_table[(n >> 8) & 0xff] :
3645 0 + lg_table[(n >> 0) & 0xff]);
3646 }
3647
3648
3649 /*---------------------------------------------------------------------------*/
3650
/* Insertion sort of [first, last) by ascending ISAd[entry].
   Elements are inserted from the front towards the back. When an element
   has the same key as the entry it lands behind, that entry is stored
   bit-complemented (negative); marked entries are shifted together with
   the entry that follows them and are not compared. */
static void tr_insertionsort(const int *ISAd, int *first, int *last) {
  int *cur, *slot;

  for(cur = first + 1; cur < last; ++cur) {
    const int moving = *cur;
    int diff;

    slot = cur - 1;
    while((diff = ISAd[moving] - ISAd[*slot]) < 0) {
      /* Shift one entry, plus any marked entries before it, one slot
         to the right. */
      do {
        slot[1] = slot[0];
        --slot;
      } while((first <= slot) && (*slot < 0));
      if(slot < first) { break; }
    }
    /* Same key as the entry in front: mark that entry. */
    if(diff == 0) { *slot = ~*slot; }
    slot[1] = moving;
  }
}
3667
3668
3669 /*---------------------------------------------------------------------------*/
3670
3671 static INLINE
3672 void
tr_fixdown(const int * ISAd,int * SA,int i,int size)3673 tr_fixdown(const int *ISAd, int *SA, int i, int size) {
3674 int j, k;
3675 int v;
3676 int c, d, e;
3677
3678 for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
3679 d = ISAd[SA[k = j++]];
3680 if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
3681 if(d <= c) { break; }
3682 }
3683 SA[i] = v;
3684 }
3685
3686 /* Simple top-down heapsort. */
3687 static
3688 void
tr_heapsort(const int * ISAd,int * SA,int size)3689 tr_heapsort(const int *ISAd, int *SA, int size) {
3690 int i, m;
3691 int t;
3692
3693 m = size;
3694 if((size % 2) == 0) {
3695 m--;
3696 if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
3697 }
3698
3699 for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
3700 if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
3701 for(i = m - 1; 0 < i; --i) {
3702 t = SA[0], SA[0] = SA[i];
3703 tr_fixdown(ISAd, SA, 0, i);
3704 SA[i] = t;
3705 }
3706 }
3707
3708
3709 /*---------------------------------------------------------------------------*/
3710
3711 /* Returns the median of three elements. */
3712 static INLINE
3713 int *
tr_median3(const int * ISAd,int * v1,int * v2,int * v3)3714 tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
3715 int *t;
3716 if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
3717 if(ISAd[*v2] > ISAd[*v3]) {
3718 if(ISAd[*v1] > ISAd[*v3]) { return v1; }
3719 else { return v3; }
3720 }
3721 return v2;
3722 }
3723
3724 /* Returns the median of five elements. */
3725 static INLINE
3726 int *
tr_median5(const int * ISAd,int * v1,int * v2,int * v3,int * v4,int * v5)3727 tr_median5(const int *ISAd,
3728 int *v1, int *v2, int *v3, int *v4, int *v5) {
3729 int *t;
3730 if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
3731 if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
3732 if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
3733 if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
3734 if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
3735 if(ISAd[*v3] > ISAd[*v4]) { return v4; }
3736 return v3;
3737 }
3738
3739 /* Returns the pivot element. */
3740 static INLINE
3741 int *
tr_pivot(const int * ISAd,int * first,int * last)3742 tr_pivot(const int *ISAd, int *first, int *last) {
3743 int *middle;
3744 int t;
3745
3746 t = last - first;
3747 middle = first + t / 2;
3748
3749 if(t <= 512) {
3750 if(t <= 32) {
3751 return tr_median3(ISAd, first, middle, last - 1);
3752 } else {
3753 t >>= 2;
3754 return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
3755 }
3756 }
3757 t >>= 3;
3758 first = tr_median3(ISAd, first, first + t, first + (t << 1));
3759 middle = tr_median3(ISAd, middle - t, middle, middle + t);
3760 last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
3761 return tr_median3(ISAd, first, middle, last);
3762 }
3763
3764
3765 /*---------------------------------------------------------------------------*/
3766
/* Work budget consulted through trbudget_check(). */
typedef struct _trbudget_t trbudget_t;
struct _trbudget_t {
  int chance;   /* number of refills still allowed; decremented per refill */
  int remain;   /* budget currently left; reduced by each accepted size */
  int incval;   /* amount added to `remain` on a refill */
  int count;    /* running sum of the sizes that trbudget_check() refused */
};
3774
3775 static INLINE
3776 void
trbudget_init(trbudget_t * budget,int chance,int incval)3777 trbudget_init(trbudget_t *budget, int chance, int incval) {
3778 budget->chance = chance;
3779 budget->remain = budget->incval = incval;
3780 }
3781
3782 static INLINE
3783 int
trbudget_check(trbudget_t * budget,int size)3784 trbudget_check(trbudget_t *budget, int size) {
3785 if(size <= budget->remain) { budget->remain -= size; return 1; }
3786 if(budget->chance == 0) { budget->count += size; return 0; }
3787 budget->remain += budget->incval - size;
3788 budget->chance -= 1;
3789 return 1;
3790 }
3791
3792
3793 /*---------------------------------------------------------------------------*/
3794
/* Three-way partition of [first, last) by the key ISAd[entry] around the
   value v. Scanning starts at `middle`; entries in [first, middle) are
   not examined by the scans. Entries equal to v are first collected at
   both ends and then swapped into the centre.
   On return *pa and *pb delimit the block of entries equal to v:
   entries before *pa are smaller, entries from *pb on are larger.
   If no entry differs from v, *pa == first and *pb == last are returned
   unchanged. */
static INLINE
void
tr_partition(const int *ISAd,
             int *first, int *middle, int *last,
             int **pa, int **pb, int v) {
  int *a, *b, *c, *d, *e, *f;
  int t, s;                 /* t is also the temporary used by SWAP */
  int x = 0;

  /* Skip the leading run of entries equal to v. */
  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
  /* Advance b over entries <= v, parking equal ones at a. */
  if(((a = b) < last) && (x < v)) {
    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
      if(x == v) { SWAP(*b, *a); ++a; }
    }
  }
  /* Same from the right end with c, parking equal ones at d. */
  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
  if((b < (d = c)) && (x > v)) {
    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
      if(x == v) { SWAP(*c, *d); --d; }
    }
  }
  /* Exchange misplaced pairs until the two scans meet. */
  for(; b < c;) {
    SWAP(*b, *c);
    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
      if(x == v) { SWAP(*b, *a); ++a; }
    }
    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
      if(x == v) { SWAP(*c, *d); --d; }
    }
  }

  if(a <= d) {
    /* Move the equal runs parked at both ends into the centre. */
    c = b - 1;
    if((s = a - first) > (t = b - a)) { s = t; }
    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
    if((s = d - c) > (t = last - d - 1)) { s = t; }
    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
    first += (b - a), last -= (d - c);
  }
  *pa = first, *pb = last;
}
3836
/* Sorts the suffixes of the middle partition [a, b) by using the sorted
   order of the suffixes of the left partition [first, a) and the right
   partition [b, last).
   Every scanned entry e yields the candidate s = e - depth; s belongs to
   the middle partition when s >= 0 and ISA[s] equals the partition's
   rank (b - SA - 1). Accepted candidates are written into the middle
   partition and ISA[s] is set to the slot they were written to. */
static void tr_copy(int *ISA, const int *SA,
                    int *first, int *a, int *b, int *last,
                    int depth) {
  const int group = b - SA - 1;
  int *scan, *fill, *stop;
  int s;

  /* Forward pass: fill the middle partition from its front. The scan
     also walks over the entries it has just written. */
  fill = a - 1;
  for(scan = first; scan <= fill; ++scan) {
    s = *scan - depth;
    if((s < 0) || (ISA[s] != group)) { continue; }
    *++fill = s;
    ISA[s] = fill - SA;
  }

  /* Backward pass: fill the remaining slots from the back, scanning
     from the end of the right partition. */
  stop = fill + 1;
  fill = b;
  for(scan = last - 1; stop < fill; --scan) {
    s = *scan - depth;
    if((s < 0) || (ISA[s] != group)) { continue; }
    *--fill = s;
    ISA[s] = fill - SA;
  }
}
3861
/* Variant of tr_copy that assigns group ranks instead of unique ranks.
   Candidates s = e - depth with s >= 0 and ISA[s] == b - SA - 1 are
   written into the middle partition [a, b) as in tr_copy, but ISA[s]
   receives a rank shared by consecutive candidates whose ISA[s + depth]
   values are equal: the slot of the first such candidate met in scan
   order. Between the two fill passes, the entries from `first` up to the
   last slot filled so far are re-ranked from back to front. */
static void tr_partialcopy(int *ISA, const int *SA,
                           int *first, int *a, int *b, int *last,
                           int depth) {
  const int group = b - SA - 1;
  int *scan, *fill, *stop;
  int s;
  int seen, prev, assigned = -1;

  /* Pass 1: forward fill from the left partition. */
  prev = -1;
  fill = a - 1;
  for(scan = first; scan <= fill; ++scan) {
    s = *scan - depth;
    if((s < 0) || (ISA[s] != group)) { continue; }
    *++fill = s;
    seen = ISA[s + depth];
    if(seen != prev) { prev = seen; assigned = fill - SA; }
    ISA[s] = assigned;
  }

  /* Pass 2: walk back over everything up to the fill point and give
     each run of equal ranks the slot of its last entry as rank. */
  prev = -1;
  for(scan = fill; first <= scan; --scan) {
    seen = ISA[*scan];
    if(seen != prev) { prev = seen; assigned = scan - SA; }
    if(assigned != seen) { ISA[*scan] = assigned; }
  }

  /* Pass 3: backward fill from the right partition. */
  prev = -1;
  stop = fill + 1;
  fill = b;
  for(scan = last - 1; stop < fill; --scan) {
    s = *scan - depth;
    if((s < 0) || (ISA[s] != group)) { continue; }
    *--fill = s;
    seen = ISA[s + depth];
    if(seen != prev) { prev = seen; assigned = fill - SA; }
    ISA[s] = assigned;
  }
}
3899
/* Sorts the group [first, last) of SA by the keys ISAd[*p] and updates ISA
   as the group is split.  Recursion is replaced by the local `stack` array,
   driven through the STACK_PUSH5 / STACK_POP5 macros (defined elsewhere in
   this file).

   `limit` doubles as a state selector:
     >= 0  remaining partitioning depth; when it reaches 0 the range is
           handed to tr_heapsort,
     -1    "tandem repeat partition" handler,
     -2    "tandem repeat copy" handler (tr_copy or tr_partialcopy),
     else  "sorted partition" handler (the code uses -3 for this).

   incr = ISAd - ISA is the key offset added each time the sort descends one
   level (ISAd += incr).  trbudget_check() gates every such descent; when it
   refuses, the stack entry at index trlink gets d = -1, which makes the
   later copy step choose tr_partialcopy instead of tr_copy.

   NOTE(review): the for(;;) loop has no visible exit; presumably STACK_POP5
   returns from the function when the stack is empty -- confirm against the
   macro definition.  The local `t` is not referenced directly here; it is
   presumably the temporary used by the SWAP macro -- confirm. */
3900 static
3901 void
tr_introsort(int * ISA,const int * ISAd,int * SA,int * first,int * last,trbudget_t * budget)3902 tr_introsort(int *ISA, const int *ISAd,
3903 int *SA, int *first, int *last,
3904 trbudget_t *budget) {
3905 #define STACK_SIZE TR_STACKSIZE
3906 struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
3907 int *a, *b, *c;
3908 int t;
3909 int v, x = 0;
3910 int incr = ISAd - ISA;
3911 int limit, next;
3912 int ssize, trlink = -1;
3913
3914 for(ssize = 0, limit = tr_ilg(last - first);;) {
3915
/* Negative limit: one of the three special handlers. */
3916 if(limit < 0) {
3917 if(limit == -1) {
3918 /* tandem repeat partition */
/* Partition on the previous level's keys (ISAd - incr) around the value
   last - SA - 1; afterwards [first,a) [a,b) [b,last) are the three parts. */
3919 tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
3920
3921 /* update ranks */
3922 if(a < last) {
3923 for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
3924 }
3925 if(b < last) {
3926 for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
3927 }
3928
3929 /* push */
/* Two entries are pushed for a non-trivial middle part: the (a, b) pair with
   d = 0, then a limit = -2 job for the whole range.  trlink is set to the
   index of the first of them. */
3930 if(1 < (b - a)) {
3931 STACK_PUSH5(NULL, a, b, 0, 0);
3932 STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
3933 trlink = ssize - 2;
3934 }
/* Continue with the smaller of the two outer parts and stack the other. */
3935 if((a - first) <= (last - b)) {
3936 if(1 < (a - first)) {
3937 STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
3938 last = a, limit = tr_ilg(a - first);
3939 } else if(1 < (last - b)) {
3940 first = b, limit = tr_ilg(last - b);
3941 } else {
3942 STACK_POP5(ISAd, first, last, limit, trlink);
3943 }
3944 } else {
3945 if(1 < (last - b)) {
3946 STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
3947 first = b, limit = tr_ilg(last - b);
3948 } else if(1 < (a - first)) {
3949 last = a, limit = tr_ilg(a - first);
3950 } else {
3951 STACK_POP5(ISAd, first, last, limit, trlink);
3952 }
3953 }
3954 } else if(limit == -2) {
3955 /* tandem repeat copy */
/* Pop the (a, b) entry pushed above; its d field is 0 unless a budget
   failure flagged it with -1 in the meantime. */
3956 a = stack[--ssize].b, b = stack[ssize].c;
3957 if(stack[ssize].d == 0) {
3958 tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
3959 } else {
3960 if(0 <= trlink) { stack[trlink].d = -1; }
3961 tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
3962 }
3963 STACK_POP5(ISAd, first, last, limit, trlink);
3964 } else {
3965 /* sorted partition */
/* Leading non-negative entries get their own position as rank. */
3966 if(0 <= *first) {
3967 a = first;
3968 do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
3969 first = a;
3970 }
/* A run of negative (bit-complemented) entries plus the entry after it forms
   one group of equal keys: flip the run back and rank the group. */
3971 if(first < last) {
3972 a = first; do { *a = ~*a; } while(*++a < 0);
3973 next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
3974 if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
3975
3976 /* push */
3977 if(trbudget_check(budget, a - first)) {
3978 if((a - first) <= (last - a)) {
3979 STACK_PUSH5(ISAd, a, last, -3, trlink);
3980 ISAd += incr, last = a, limit = next;
3981 } else {
3982 if(1 < (last - a)) {
3983 STACK_PUSH5(ISAd + incr, first, a, next, trlink);
3984 first = a, limit = -3;
3985 } else {
3986 ISAd += incr, last = a, limit = next;
3987 }
3988 }
3989 } else {
3990 if(0 <= trlink) { stack[trlink].d = -1; }
3991 if(1 < (last - a)) {
3992 first = a, limit = -3;
3993 } else {
3994 STACK_POP5(ISAd, first, last, limit, trlink);
3995 }
3996 }
3997 } else {
3998 STACK_POP5(ISAd, first, last, limit, trlink);
3999 }
4000 }
4001 continue;
4002 }
4003
/* Small ranges: insertion sort, then let the "sorted partition" handler
   assign ranks. */
4004 if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
4005 tr_insertionsort(ISAd, first, last);
4006 limit = -3;
4007 continue;
4008 }
4009
/* Depth budget exhausted: heapsort, then bit-complement every entry whose key
   equals that of its right neighbour so equal-key runs can be recognised. */
4010 if(limit-- == 0) {
4011 tr_heapsort(ISAd, first, last - first);
4012 for(a = last - 1; first < a; a = b) {
4013 for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
4014 }
4015 limit = -3;
4016 continue;
4017 }
4018
4019 /* choose pivot */
4020 a = tr_pivot(ISAd, first, last);
4021 SWAP(*first, *a);
4022 v = ISAd[*first];
4023
4024 /* partition */
4025 tr_partition(ISAd, first, first + 1, last, &a, &b, v);
/* (last - first) == (b - a) means every key equalled the pivot. */
4026 if((last - first) != (b - a)) {
4027 next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
4028
4029 /* update ranks */
4030 for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
4031 if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
4032
4033 /* push */
/* The branches below order the three parts by size: the smallest is processed
   next and the others are stacked.  The middle part [a, b) is always taken at
   the next key offset (ISAd + incr) with limit = next. */
4034 if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
4035 if((a - first) <= (last - b)) {
4036 if((last - b) <= (b - a)) {
4037 if(1 < (a - first)) {
4038 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4039 STACK_PUSH5(ISAd, b, last, limit, trlink);
4040 last = a;
4041 } else if(1 < (last - b)) {
4042 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4043 first = b;
4044 } else {
4045 ISAd += incr, first = a, last = b, limit = next;
4046 }
4047 } else if((a - first) <= (b - a)) {
4048 if(1 < (a - first)) {
4049 STACK_PUSH5(ISAd, b, last, limit, trlink);
4050 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4051 last = a;
4052 } else {
4053 STACK_PUSH5(ISAd, b, last, limit, trlink);
4054 ISAd += incr, first = a, last = b, limit = next;
4055 }
4056 } else {
4057 STACK_PUSH5(ISAd, b, last, limit, trlink);
4058 STACK_PUSH5(ISAd, first, a, limit, trlink);
4059 ISAd += incr, first = a, last = b, limit = next;
4060 }
4061 } else {
4062 if((a - first) <= (b - a)) {
4063 if(1 < (last - b)) {
4064 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4065 STACK_PUSH5(ISAd, first, a, limit, trlink);
4066 first = b;
4067 } else if(1 < (a - first)) {
4068 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4069 last = a;
4070 } else {
4071 ISAd += incr, first = a, last = b, limit = next;
4072 }
4073 } else if((last - b) <= (b - a)) {
4074 if(1 < (last - b)) {
4075 STACK_PUSH5(ISAd, first, a, limit, trlink);
4076 STACK_PUSH5(ISAd + incr, a, b, next, trlink);
4077 first = b;
4078 } else {
4079 STACK_PUSH5(ISAd, first, a, limit, trlink);
4080 ISAd += incr, first = a, last = b, limit = next;
4081 }
4082 } else {
4083 STACK_PUSH5(ISAd, first, a, limit, trlink);
4084 STACK_PUSH5(ISAd, b, last, limit, trlink);
4085 ISAd += incr, first = a, last = b, limit = next;
4086 }
4087 }
4088 } else {
/* Middle part is trivial or the budget refused it: only the two outer parts
   remain, and a refused non-trivial middle part flags the trlink entry. */
4089 if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
4090 if((a - first) <= (last - b)) {
4091 if(1 < (a - first)) {
4092 STACK_PUSH5(ISAd, b, last, limit, trlink);
4093 last = a;
4094 } else if(1 < (last - b)) {
4095 first = b;
4096 } else {
4097 STACK_POP5(ISAd, first, last, limit, trlink);
4098 }
4099 } else {
4100 if(1 < (last - b)) {
4101 STACK_PUSH5(ISAd, first, a, limit, trlink);
4102 first = b;
4103 } else if(1 < (a - first)) {
4104 last = a;
4105 } else {
4106 STACK_POP5(ISAd, first, last, limit, trlink);
4107 }
4108 }
4109 }
4110 } else {
/* All keys equal at this offset: retry the whole range one level deeper if
   the budget allows, otherwise give up on it for this pass. */
4111 if(trbudget_check(budget, last - first)) {
4112 limit = tr_ilg(last - first), ISAd += incr;
4113 } else {
4114 if(0 <= trlink) { stack[trlink].d = -1; }
4115 STACK_POP5(ISAd, first, last, limit, trlink);
4116 }
4117 }
4118 }
4119 #undef STACK_SIZE
4120 }
4121
4122
4123
4124 /*---------------------------------------------------------------------------*/
4125
4126 /* Tandem repeat sort */
4127 static
4128 void
trsort(int * ISA,int * SA,int n,int depth)4129 trsort(int *ISA, int *SA, int n, int depth) {
4130 int *ISAd;
4131 int *first, *last;
4132 trbudget_t budget;
4133 int t, skip, unsorted;
4134
4135 trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
4136 /* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
4137 for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
4138 first = SA;
4139 skip = 0;
4140 unsorted = 0;
4141 do {
4142 if((t = *first) < 0) { first -= t; skip += t; }
4143 else {
4144 if(skip != 0) { *(first + skip) = skip; skip = 0; }
4145 last = SA + ISA[t] + 1;
4146 if(1 < (last - first)) {
4147 budget.count = 0;
4148 tr_introsort(ISA, ISAd, SA, first, last, &budget);
4149 if(budget.count != 0) { unsorted += budget.count; }
4150 else { skip = first - last; }
4151 } else if((last - first) == 1) {
4152 skip = -1;
4153 }
4154 first = last;
4155 }
4156 } while(first < (SA + n));
4157 if(skip != 0) { *(first + skip) = skip; }
4158 if(unsorted == 0) { break; }
4159 }
4160 }
4161
4162
4163 /*---------------------------------------------------------------------------*/
4164
4165 /* Sorts suffixes of type B*. */
/* Steps, in order:
   1. Scan T right to left, counting type A, B and B* suffixes per bucket and
      storing the start positions of the B* suffixes at the tail of SA.
   2. Turn the counts into bucket start/end indices.
   3. Distribute the B* suffixes by their first two characters, sort each
      two-character bucket with sssort, derive ranks into ISAb and finish the
      ordering with trsort.
   4. Write the sorted B* suffixes to their final places in SA and recompute
      the bucket boundaries.
   Returns m, the number of type B* suffixes (steps 3-4 are skipped when m is
   0).  bucket_A and bucket_B are accessed through the BUCKET_A, BUCKET_B and
   BUCKET_BSTAR macros defined elsewhere in this file.  With _OPENMP defined,
   the sssort calls are handed out to threads under a critical section and
   each thread uses its own slice of the work buffer. */
4166 static
4167 int
sort_typeBstar(const unsigned char * T,int * SA,int * bucket_A,int * bucket_B,int n)4168 sort_typeBstar(const unsigned char *T, int *SA,
4169 int *bucket_A, int *bucket_B,
4170 int n) {
4171 int *PAb, *ISAb, *buf;
4172 #ifdef _OPENMP
4173 int *curbuf;
4174 int l;
4175 #endif
4176 int i, j, k, t, m, bufsize;
4177 int c0, c1;
4178 #ifdef _OPENMP
4179 int d0, d1;
4180 int tmp;
4181 #endif
4182
4183 /* Initialize bucket arrays. */
4184 for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
4185 for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
4186
4187 /* Count the number of occurrences of the first one or two characters of each
4188 type A, B and B* suffix. Moreover, store the beginning position of all
4189 type B* suffixes into the array SA. */
4190 for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
4191 /* type A suffix. */
4192 do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
4193 if(0 <= i) {
4194 /* type B* suffix. */
4195 ++BUCKET_BSTAR(c0, c1);
4196 SA[--m] = i;
4197 /* type B suffix. */
4198 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
4199 ++BUCKET_B(c0, c1);
4200 }
4201 }
4202 }
/* m counted down from n; convert it to the number of B* suffixes. */
4203 m = n - m;
4204 /*
4205 note:
4206 A type B* suffix is lexicographically smaller than a type B suffix that
4207 begins with the same first two characters.
4208 */
4209
4210 /* Calculate the index of start/end point of each bucket. */
4211 for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
4212 t = i + BUCKET_A(c0);
4213 BUCKET_A(c0) = i + j; /* start point */
4214 i = t + BUCKET_B(c0, c0);
4215 for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
4216 j += BUCKET_BSTAR(c0, c1);
4217 BUCKET_BSTAR(c0, c1) = j; /* end point */
4218 i += BUCKET_B(c0, c1);
4219 }
4220 }
4221
4222 if(0 < m) {
4223 /* Sort the type B* suffixes by their first two characters. */
/* PAb points at the B* start positions stored at the tail of SA; ISAb is the
   rank array placed at SA + m. */
4224 PAb = SA + n - m; ISAb = SA + m;
4225 for(i = m - 2; 0 <= i; --i) {
4226 t = PAb[i], c0 = T[t], c1 = T[t + 1];
4227 SA[--BUCKET_BSTAR(c0, c1)] = i;
4228 }
4229 t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
4230 SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
4231
4232 /* Sort the type B* substrings using sssort. */
4233 #ifdef _OPENMP
4234 tmp = omp_get_max_threads();
4235 buf = SA + m, bufsize = (n - (2 * m)) / tmp;
4236 c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
4237 #pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
4238 {
4239 tmp = omp_get_thread_num();
4240 curbuf = buf + tmp * bufsize;
4241 k = 0;
4242 for(;;) {
/* Under the lock, claim the next bucket [k, l) holding more than one entry
   and publish the updated cursor (c0, c1, j) for the other threads. */
4243 #pragma omp critical(sssort_lock)
4244 {
4245 if(0 < (l = j)) {
4246 d0 = c0, d1 = c1;
4247 do {
4248 k = BUCKET_BSTAR(d0, d1);
4249 if(--d1 <= d0) {
4250 d1 = ALPHABET_SIZE - 1;
4251 if(--d0 < 0) { break; }
4252 }
4253 } while(((l - k) <= 1) && (0 < (l = k)));
4254 c0 = d0, c1 = d1, j = k;
4255 }
4256 }
4257 if(l == 0) { break; }
4258 sssort(T, PAb, SA + k, SA + l,
4259 curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
4260 }
4261 }
4262 #else
/* Single-threaded: walk the two-character buckets from the last to the first
   and sort every bucket holding more than one entry. */
4263 buf = SA + m, bufsize = n - (2 * m);
4264 for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
4265 for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
4266 i = BUCKET_BSTAR(c0, c1);
4267 if(1 < (j - i)) {
4268 sssort(T, PAb, SA + i, SA + j,
4269 buf, bufsize, 2, n, *(SA + i) == (m - 1));
4270 }
4271 }
4272 }
4273 #endif
4274
4275 /* Compute ranks of type B* substrings. */
/* Non-negative entries get their own index as rank and the run is recorded as
   a negative length; negative (bit-complemented) entries are flipped back and
   share the rank j of the group they belong to. */
4276 for(i = m - 1; 0 <= i; --i) {
4277 if(0 <= SA[i]) {
4278 j = i;
4279 do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
4280 SA[i + 1] = i - j;
4281 if(i <= 0) { break; }
4282 }
4283 j = i;
4284 do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
4285 ISAb[SA[i]] = j;
4286 }
4287
4288 /* Construct the inverse suffix array of type B* suffixes using trsort. */
4289 trsort(ISAb, SA, m, 1);
4290
4291 /* Set the sorted order of type B* suffixes. */
/* Rescan T right to left to revisit the B* positions in the same order as the
   counting pass; a position is stored bit-complemented unless it is 0 or is
   preceded by at least one type B suffix (1 < t - i). */
4292 for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
4293 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
4294 if(0 <= i) {
4295 t = i;
4296 for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
4297 SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
4298 }
4299 }
4300
4301 /* Calculate the index of start/end point of each bucket. */
4302 BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
4303 for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
4304 i = BUCKET_A(c0 + 1) - 1;
4305 for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
4306 t = i - BUCKET_B(c0, c1);
4307 BUCKET_B(c0, c1) = i; /* end point */
4308
4309 /* Move all type B* suffixes to the correct position. */
4310 for(i = t, j = BUCKET_BSTAR(c0, c1);
4311 j <= k;
4312 --i, --k) { SA[i] = SA[k]; }
4313 }
4314 BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
4315 BUCKET_B(c0, c0) = i; /* end point */
4316 }
4317 }
4318
4319 return m;
4320 }
4321
4322 /* Constructs the suffix array by using the sorted order of type B* suffixes. */
/* T is the text of length n, SA the array prepared by sort_typeBstar and m
   the number of type B* suffixes it returned; when m is 0 the right-to-left
   pass is skipped.  bucket_A / bucket_B are used through the BUCKET_A,
   BUCKET_B and BUCKET_BSTAR macros defined elsewhere in this file.
   In both passes a value written as ~s is negative, so the scan skips it and
   flips it back when it is reached.  T[n - 2] is read, so n must be at
   least 2. */
4323 static
4324 void
construct_SA(const unsigned char * T,int * SA,int * bucket_A,int * bucket_B,int n,int m)4325 construct_SA(const unsigned char *T, int *SA,
4326 int *bucket_A, int *bucket_B,
4327 int n, int m) {
4328 int *i, *j, *k;
4329 int s;
4330 int c0, c1, c2;
4331
4332 if(0 < m) {
4333 /* Construct the sorted order of type B suffixes by using
4334 the sorted order of type B* suffixes. */
4335 for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
4336 /* Scan the suffix array from right to left. */
4337 for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
4338 j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
4339 i <= j;
4340 --j) {
4341 if(0 < (s = *j)) {
4342 assert(T[s] == c1);
4343 assert(((s + 1) < n) && (T[s] <= T[s + 1]));
4344 assert(T[s - 1] <= T[s]);
4345 *j = ~s;
/* Induce the preceding suffix s - 1; it is stored bit-complemented when the
   character before it is larger than c0. */
4346 c0 = T[--s];
4347 if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
/* k is the write cursor of bucket (c2, c1); save it and switch buckets when
   the leading character changes. */
4348 if(c0 != c2) {
4349 if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
4350 k = SA + BUCKET_B(c2 = c0, c1);
4351 }
4352 assert(k < j);
4353 *k-- = s;
4354 } else {
4355 assert(((s == 0) && (T[s] == c1)) || (s < 0));
4356 *j = ~s;
4357 }
4358 }
4359 }
4360 }
4361
4362 /* Construct the suffix array by using
4363 the sorted order of type B suffixes. */
/* Seed the pass with the last suffix, n - 1, placed at the start of the
   bucket of its first character. */
4364 k = SA + BUCKET_A(c2 = T[n - 1]);
4365 *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
4366 /* Scan the suffix array from left to right. */
4367 for(i = SA, j = SA + n; i < j; ++i) {
4368 if(0 < (s = *i)) {
4369 assert(T[s - 1] >= T[s]);
4370 c0 = T[--s];
4371 if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
4372 if(c0 != c2) {
4373 BUCKET_A(c2) = k - SA;
4374 k = SA + BUCKET_A(c2 = c0);
4375 }
4376 assert(i < k);
4377 *k++ = s;
4378 } else {
4379 assert(s < 0);
4380 *i = ~s;
4381 }
4382 }
4383 }
4384
4385 /* Constructs the burrows-wheeler transformed string directly
4386 by using the sorted order of type B* suffixes. */
/* Same two-pass structure as construct_SA, but processed slots of SA are
   overwritten with characters of T instead of suffix positions: the first
   pass leaves ~c0 in a processed slot and the second pass leaves c0, or
   flips a complemented value back.
   Returns the index in SA of the slot whose value was 0 when the
   left-to-right scan reached it (`orig`, which defaults to SA itself, i.e.
   index 0).  T[n - 2] is read, so n must be at least 2. */
4387 static
4388 int
construct_BWT(const unsigned char * T,int * SA,int * bucket_A,int * bucket_B,int n,int m)4389 construct_BWT(const unsigned char *T, int *SA,
4390 int *bucket_A, int *bucket_B,
4391 int n, int m) {
4392 int *i, *j, *k, *orig;
4393 int s;
4394 int c0, c1, c2;
4395
4396 if(0 < m) {
4397 /* Construct the sorted order of type B suffixes by using
4398 the sorted order of type B* suffixes. */
4399 for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
4400 /* Scan the suffix array from right to left. */
4401 for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
4402 j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
4403 i <= j;
4404 --j) {
4405 if(0 < (s = *j)) {
4406 assert(T[s] == c1);
4407 assert(((s + 1) < n) && (T[s] <= T[s + 1]));
4408 assert(T[s - 1] <= T[s]);
4409 c0 = T[--s];
/* Replace the suffix position by the complemented preceding character. */
4410 *j = ~((int)c0);
4411 if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
4412 if(c0 != c2) {
4413 if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
4414 k = SA + BUCKET_B(c2 = c0, c1);
4415 }
4416 assert(k < j);
4417 *k-- = s;
4418 } else if(s != 0) {
4419 *j = ~s;
/* The else branch below exists only in debug builds (NDEBUG undefined); it
   checks the s == 0 case and changes nothing. */
4420 #ifndef NDEBUG
4421 } else {
4422 assert(T[s] == c1);
4423 #endif
4424 }
4425 }
4426 }
4427 }
4428
4429 /* Construct the BWTed string by using
4430 the sorted order of type B suffixes. */
/* Seed with the last suffix: either its position n - 1, or the complemented
   character T[n - 2] when that character is smaller than T[n - 1]. */
4431 k = SA + BUCKET_A(c2 = T[n - 1]);
4432 *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
4433 /* Scan the suffix array from left to right. */
4434 for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
4435 if(0 < (s = *i)) {
4436 assert(T[s - 1] >= T[s]);
4437 c0 = T[--s];
4438 *i = c0;
4439 if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
4440 if(c0 != c2) {
4441 BUCKET_A(c2) = k - SA;
4442 k = SA + BUCKET_A(c2 = c0);
4443 }
4444 assert(i < k);
4445 *k++ = s;
4446 } else if(s != 0) {
4447 *i = ~s;
4448 } else {
/* s == 0: remember this slot; its index is the return value. */
4449 orig = i;
4450 }
4451 }
4452
4453 return orig - SA;
4454 }
4455
4456
4457 /*---------------------------------------------------------------------------*/
4458
4459 /*- Function -*/
4460
4461 int
divsufsort(const unsigned char * T,int * SA,int n)4462 divsufsort(const unsigned char *T, int *SA, int n) {
4463 int *bucket_A, *bucket_B;
4464 int m;
4465 int err = 0;
4466
4467 /* Check arguments. */
4468 if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
4469 else if(n == 0) { return 0; }
4470 else if(n == 1) { SA[0] = 0; return 0; }
4471 else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
4472
4473 bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
4474 bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
4475
4476 /* Suffixsort. */
4477 if((bucket_A != NULL) && (bucket_B != NULL)) {
4478 m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
4479 construct_SA(T, SA, bucket_A, bucket_B, n, m);
4480 } else {
4481 err = -2;
4482 }
4483
4484 free(bucket_B);
4485 free(bucket_A);
4486
4487 return err;
4488 }
4489
4490 int
divbwt(const unsigned char * T,unsigned char * U,int * A,int n)4491 divbwt(const unsigned char *T, unsigned char *U, int *A, int n) {
4492 int *B;
4493 int *bucket_A, *bucket_B;
4494 int m, pidx, i;
4495
4496 /* Check arguments. */
4497 if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
4498 else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
4499
4500 if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); }
4501 bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
4502 bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
4503
4504 /* Burrows-Wheeler Transform. */
4505 if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
4506 m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
4507 pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
4508
4509 /* Copy to output string. */
4510 U[0] = T[n - 1];
4511 for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; }
4512 for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; }
4513 pidx += 1;
4514 } else {
4515 pidx = -2;
4516 }
4517
4518 free(bucket_B);
4519 free(bucket_A);
4520 if(A == NULL) { free(B); }
4521
4522 return pidx;
4523 }
4524
4525 // End divsufsort.c
4526
4527 /////////////////////////////// add ///////////////////////////////////
4528
// E8E9 transform of buf[0..n-1] to improve compression of .exe and .dll.
// Scanning backward from offset n-5, every 5 byte pattern
// (E8|E9 xx xx xx 00|FF) found at offset i has its 3 middle bytes, read
// as a little-endian number x, replaced by (x+i) mod 2^24 (LSB first).
// Buffers shorter than 5 bytes are left unchanged.
void e8e9(unsigned char* buf, int n) {
  int pos=n-5;
  while (pos>=0) {
    unsigned char* const op=buf+pos;
    // opcode is E8 or E9 and the last byte is 00 or FF
    if ((op[0]|1)==0xe9 && (op[4]==0x00 || op[4]==0xff)) {
      const unsigned addr=(op[1] | op[2]<<8 | op[3]<<16)+pos;
      op[1]=addr;
      op[2]=addr>>8;
      op[3]=addr>>16;
    }
    --pos;
  }
}
4542
4543 // LZ/BWT preprocessor for levels 1..3 compression and e8e9 filter.
4544 // Level 1 uses variable length LZ77 codes like in the lazy compressor:
4545 //
4546 // 00,n,L[n] = n literal bytes
4547 // mm,mmm,n,ll,r,q (mm > 00) = match 4*n+ll at offset (q<<rb)+r-1
4548 //
4549 // where q is written in 8mm+mmm-8 (0..23) bits with an implied leading 1 bit
4550 // and n is written using interleaved Elias Gamma coding, i.e. the leading
4551 // 1 bit is implied, remaining bits are preceded by a 1 and terminated by
4552 // a 0. e.g. abc is written 1,b,1,c,0. Codes are packed LSB first and
4553 // padded with leading 0 bits in the last byte. r is a number with rb bits,
4554 // where rb = log2(blocksize) - 24.
4555 //
4556 // Level 2 is byte oriented LZ77 with minimum match length m = $3 = args[2]
4557 // with m in 1..64. Lengths and offsets are MSB first:
4558 // 00xxxxxx x+1 (1..64) literals follow
4559 // yyxxxxxx y+1 (2..4) offset bytes follow, match length x+m (m..m+63)
4560 //
4561 // Level 3 is BWT with the end of string byte coded as 255 and the
4562 // last 4 bytes giving its position LSB first.
4563
// Number of significant bits in x, i.e. floor(log2(x)) + 1 for x > 0
// and 0 for x = 0. Result is in 0..32.
int lg(unsigned x) {
  // bit length of each 4 bit value
  static const unsigned char nibbleBits[16]=
    {0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4};
  int shifted=0;  // bits already shifted out of x
  if (x>>16) { shifted=16; x>>=16; }
  if (x>>8) { shifted+=8; x>>=8; }
  if (x>>4) { shifted+=4; x>>=4; }
  assert(x<16);
  return nibbleBits[x]+shifted;
}
4574
// Population count: return how many bits of x are set to 1.
int nbits(unsigned x) {
  int ones=0;
  while (x) {
    x&=x-1;  // clear the lowest set bit
    ++ones;
  }
  return ones;
}
4581
4582 // Encode inbuf to buf using LZ77. args are as follows:
4583 // args[0] is log2 buffer size in MB.
4584 // args[1] is level (1=var. length, 2=byte aligned lz77, 3=bwt) + 4 if E8E9.
4585 // args[2] is the lz77 minimum match length and context order.
4586 // args[3] is the lz77 higher context order to search first, or else 0.
4587 // args[4] is the log2 hash bucket size (number of searches).
4588 // args[5] is the log2 hash table size. If 21+args[0] then use a suffix array.
4589 // args[6] is the secondary context look ahead
4590 // sap is pointer to external suffix array of inbuf or 0. If supplied and
4591 // args[1]=5..7 then it is assumed that E8E9 was already applied to
4592 // both the input and sap and the input buffer is not modified.
4593
4594 class LZBuffer: public libzpaq::Reader {
4595 libzpaq::Array<unsigned> ht;// hash table, confirm in low bits, or SA+ISA
4596 const unsigned char* in; // input pointer
4597 const int checkbits; // hash confirmation size or lg(ISA size)
4598 const int level; // 1=var length LZ77, 2=byte aligned LZ77, 3=BWT
4599 const unsigned htsize; // size of hash table
4600 const unsigned n; // input length
4601 unsigned i; // current location in in (0 <= i < n)
4602 const unsigned minMatch; // minimum match length
4603 const unsigned minMatch2; // second context order or 0 if not used
4604 const unsigned maxMatch; // longest match length allowed
4605 const unsigned maxLiteral; // longest literal length allowed
4606 const unsigned lookahead; // second context look ahead
4607 unsigned h1, h2; // low, high order context hashes of in[i..]
4608 const unsigned bucket; // number of matches to search per hash - 1
4609 const unsigned shift1, shift2; // how far to shift h1, h2 per hash
4610 const int minMatchBoth; // max(minMatch, minMatch2)
4611 const unsigned rb; // number of level 1 r bits in match code
4612 unsigned bits; // pending output bits (level 1)
4613 unsigned nbits; // number of bits in bits
4614 unsigned rpos, wpos; // read, write pointers
4615 unsigned idx; // BWT index
4616 const unsigned* sa; // suffix array for BWT or LZ77-SA
4617 unsigned* isa; // inverse suffix array for LZ77-SA
4618 enum {BUFSIZE=1<<14}; // output buffer size
4619 unsigned char buf[BUFSIZE]; // output buffer
4620
4621 void write_literal(unsigned i, unsigned& lit);
4622 void write_match(unsigned len, unsigned off);
4623 void fill(); // encode to buf
4624
4625 // write k bits of x
putb(unsigned x,int k)4626 void putb(unsigned x, int k) {
4627 x&=(1<<k)-1;
4628 bits|=x<<nbits;
4629 nbits+=k;
4630 while (nbits>7) {
4631 assert(wpos<BUFSIZE);
4632 buf[wpos++]=bits, bits>>=8, nbits-=8;
4633 }
4634 }
4635
4636 // write last byte
flush()4637 void flush() {
4638 assert(wpos<BUFSIZE);
4639 if (nbits>0) buf[wpos++]=bits;
4640 bits=nbits=0;
4641 }
4642
4643 // write 1 byte
put(int c)4644 void put(int c) {
4645 assert(wpos<BUFSIZE);
4646 buf[wpos++]=c;
4647 }
4648
4649 public:
4650 LZBuffer(StringBuffer& inbuf, int args[], const unsigned* sap=0);
4651
4652 // return 1 byte of compressed output (overrides Reader)
get()4653 int get() {
4654 int c=-1;
4655 if (rpos==wpos) fill();
4656 if (rpos<wpos) c=buf[rpos++];
4657 if (rpos==wpos) rpos=wpos=0;
4658 return c;
4659 }
4660
4661 // Read up to p[0..n-1] and return bytes read.
4662 int read(char* p, int n);
4663 };
4664
4665 // Read n bytes of compressed output into p and return number of
4666 // bytes read in 0..n. 0 signals EOF (overrides Reader).
read(char * p,int n)4667 int LZBuffer::read(char* p, int n) {
4668 if (rpos==wpos) fill();
4669 int nr=n;
4670 if (nr>int(wpos-rpos)) nr=wpos-rpos;
4671 if (nr) memcpy(p, buf+rpos, nr);
4672 rpos+=nr;
4673 assert(rpos<=wpos);
4674 if (rpos==wpos) rpos=wpos=0;
4675 return nr;
4676 }
4677
// Set up an encoder for inbuf. args[0..6] select block size, level and
// search parameters; sap is an optional external suffix array of inbuf.
//
// ht is sized in the initializer list according to the mode:
//   level 3 (BWT):             n+1 elements, or 0 if sap is supplied
//   args[5]-args[0] < 21:      1<<args[5] elements (LZ77 hash table)
//   otherwise (LZ77 with SA):  n elements for the SA (0 if sap is supplied)
//                              plus 1<<17<<args[0] elements for the ISA
// Members are initialized in declaration order, so htsize (from ht) and
// shift1, shift2, minMatchBoth (from minMatch, minMatch2, lookahead) rely on
// the members declared before them.
//
// Calls error() if the minimum match length is below 4 for level 1 or
// below 1 for level 2. If args[1] > 4 and no sap is given, the E8E9
// transform is applied to inbuf in place before anything else.
LZBuffer(StringBuffer & inbuf,int args[],const unsigned * sap)4678 LZBuffer::LZBuffer(StringBuffer& inbuf, int args[], const unsigned* sap):
4679 ht((args[1]&3)==3 ? (inbuf.size()+1)*!sap // for BWT suffix array
4680 : args[5]-args[0]<21 ? 1u<<args[5] // for LZ77 hash table
4681 : (inbuf.size()*!sap)+(1u<<17<<args[0])), // for LZ77 SA and ISA
4682 in(inbuf.data()),
4683 checkbits(args[5]-args[0]<21 ? 12-args[0] : 17+args[0]),
4684 level(args[1]&3),
4685 htsize(ht.size()),
4686 n(inbuf.size()),
4687 i(0),
4688 minMatch(args[2]),
4689 minMatch2(args[3]),
4690 maxMatch(BUFSIZE*3),
4691 maxLiteral(BUFSIZE/4),
4692 lookahead(args[6]),
4693 h1(0), h2(0),
4694 bucket((1<<args[4])-1),
4695 shift1(minMatch>0 ? (args[5]-1)/minMatch+1 : 1),
4696 shift2(minMatch2>0 ? (args[5]-1)/minMatch2+1 : 0),
4697 minMatchBoth(max(minMatch, minMatch2+lookahead)+4),
4698 rb(args[0]>4 ? args[0]-4 : 0),
4699 bits(0), nbits(0), rpos(0), wpos(0),
4700 idx(0), sa(0), isa(0) {
4701 assert(args[0]>=0);
4702 assert(n<=(1u<<20<<args[0]));
4703 assert(args[1]>=1 && args[1]<=7 && args[1]!=4);
4704 assert(level>=1 && level<=3);
4705 if ((minMatch<4 && level==1) || (minMatch<1 && level==2))
4706 error("match length $3 too small");
4707
4708 // e8e9 transform
4709 if (args[1]>4 && !sap) e8e9(inbuf.data(), n);
4710
4711 // build suffix array if not supplied
4712 if (args[5]-args[0]>=21 || level==3) { // LZ77-SA or BWT
4713 if (sap)
4714 sa=sap;
4715 else {
4716 assert(ht.size()>=n);
4717 assert(ht.size()>0);
// The suffix array is built in place at the start of ht.
4718 sa=&ht[0];
4719 if (n>0) divsufsort((const unsigned char*)in, (int*)sa, n);
4720 }
// For LZ77-SA the ISA lives in ht too: right after the SA when the SA was
// built here, or at the start of ht when sap was supplied.
4721 if (level<3) {
4722 assert(ht.size()>=(n*(sap==0))+(1u<<17<<args[0]));
4723 isa=&ht[n*(sap==0)];
4724 }
4725 }
4726 }
4727
4728 // Encode from in to buf until end of input or buf is not empty
//
// Level 3 (BWT) writes n+5 bytes in total: in[n-1] (255 if n is 0), then
// for each suffix array entry the byte preceding that suffix, with 255
// written (and the position remembered in idx) where the entry is 0, and
// finally the 4 bytes of idx, LSB first.
//
// Levels 1 and 2 (LZ77) loop while input remains and buf is less than half
// full. At each position the best match is searched, either through the
// suffix array (when isa is set) or through the hash table. A good enough
// match is written after any pending literals; otherwise the byte is
// counted as a literal. At end of input the remaining literals are written
// and the bit buffer is flushed.
LZBuffer::fill()4729 void LZBuffer::fill() {
4730
4731 // BWT
4732 if (level==3) {
4733 assert(in || n==0);
4734 assert(sa);
4735 for (; wpos<BUFSIZE && i<n+5; ++i) {
4736 if (i==0) put(n>0 ? in[n-1] : 255);
4737 else if (i>n) put(idx&255), idx>>=8;
4738 else if (sa[i-1]==0) idx=i, put(255);
4739 else put(in[sa[i-1]-1]);
4740 }
4741 return;
4742 }
4743
4744 // LZ77: scan the input
4745 unsigned lit=0; // number of output literals pending
// mask selects the low checkbits bits: the hash check in hash table mode,
// the index into the ISA window in suffix array mode
4746 const unsigned mask=(1<<checkbits)-1;
4747 while (i<n && wpos*2<BUFSIZE) {
4748
4749 // Search for longest match, or pick closest in case of tie
4750 unsigned blen=minMatch-1; // best match length
4751 unsigned bp=0; // pointer to best match
4752 unsigned blit=0; // literals before best match
4753 int bscore=0; // best cost
4754
4755 // Look up contexts in suffix array
4756 if (isa) {
// isa only covers the window of positions sharing i's high bits (i&~mask);
// when i has left the window it is rebuilt with a full pass over sa.
4757 if (sa[isa[i&mask]]!=i) // rebuild ISA
4758 for (unsigned j=0; j<n; ++j)
4759 if ((sa[j]&~mask)==(i&~mask))
4760 isa[sa[j]&mask]=j;
4761 for (unsigned h=0; h<=lookahead; ++h) {
4762 unsigned q=isa[(h+i)&mask]; // location of h+i in SA
4763 assert(q<n);
4764 if (sa[q]!=h+i) continue;
4765 for (int j=-1; j<=1; j+=2) { // search backward and forward
4766 for (unsigned k=1; k<=bucket; ++k) {
4767 unsigned p; // match to be tested
// q+j*k is evaluated in unsigned arithmetic, so stepping below 0 wraps to a
// large value and is rejected by the <n test.
4768 if (q+j*k<n && (p=sa[q+j*k]-h)<i) {
4769 assert(p<n);
4770 unsigned l, l1; // length of match, leading literals
4771 for (l=h; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
4772 for (l1=h; l1>0 && in[p+l1-1]==in[i+l1-1]; --l1);
// 8 points per matched byte, minus lg(offset), minus 4 if the leading
// literals would start a new literal run, minus 11; then scaled by 5/8
// once per byte of look ahead.
4773 int score=int(l-l1)*8-lg(i-p)-4*(lit==0 && l1>0)-11;
4774 for (unsigned a=0; a<h; ++a) score=score*5/8;
4775 if (score>bscore) blen=l, bp=p, blit=l1, bscore=score;
4776 if (l<blen || l<minMatch || l>255) break;
4777 }
4778 }
4779 }
4780 if (bscore<=0 || blen<minMatch) break;
4781 }
4782 }
4783
4784 // Look up contexts in a hash table.
4785 // Try the longest context orders first. If a match is found, then
4786 // skip the lower order as a speed optimization.
4787 else if (level==1 || minMatch<=64) {
4788 if (minMatch2>0) {
4789 for (unsigned k=0; k<=bucket; ++k) {
// Each table entry is (position<<checkbits)|(check bits taken from the byte
// at position+3); 0 means empty.
4790 unsigned p=ht[h2^k];
4791 if (p && (p&mask)==(in[i+3]&mask)) {
4792 p>>=checkbits;
4793 if (p<i && i+blen<=n && in[p+blen-1]==in[i+blen-1]) {
4794 unsigned l; // match length from lookahead
4795 for (l=lookahead; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
4796 if (l>=minMatch2+lookahead) {
4797 int l1; // length back from lookahead
4798 for (l1=lookahead; l1>0 && in[p+l1-1]==in[i+l1-1]; --l1);
4799 assert(l1>=0 && l1<=int(lookahead));
4800 int score=int(l-l1)*8-lg(i-p)-8*(lit==0 && l1>0)-11;
4801 if (score>bscore) blen=l, bp=p, blit=l1, bscore=score;
4802 }
4803 }
4804 }
4805 if (blen>=128) break;
4806 }
4807 }
4808
4809 // Search the lower order context
4810 if (!minMatch2 || blen<minMatch2) {
4811 for (unsigned k=0; k<=bucket; ++k) {
4812 unsigned p=ht[h1^k];
4813 if (p && (p&mask)==(in[i+3]&mask)) {
4814 p>>=checkbits;
4815 if (p<i && i+blen<=n && in[p+blen-1]==in[i+blen-1]) {
4816 unsigned l;
4817 for (l=0; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
4818 int score=l*8-lg(i-p)-2*(lit>0)-11;
4819 if (score>bscore) blen=l, bp=p, blit=0, bscore=score;
4820 }
4821 }
4822 if (blen>=128) break;
4823 }
4824 }
4825 }
4826
4827 // If match is long enough, then output any pending literals first,
4828 // and then the match. blen is the length of the match.
4829 assert(i>=bp);
4830 const unsigned off=i-bp; // offset
// For level 2 the required length grows by 1 for offsets of 1<<16 or more
// and by another 1 for offsets of 1<<24 or more.
4831 if (off>0 && bscore>0
4832 && blen-blit>=minMatch+(level==2)*((off>=(1<<16))+(off>=(1<<24)))) {
4833 lit+=blit;
4834 write_literal(i+blit, lit);
4835 write_match(blen-blit, off);
4836 }
4837
4838 // Otherwise add to literal length
4839 else {
4840 blen=1;
4841 ++lit;
4842 }
4843
4844 // Update index, advance blen bytes
4845 if (isa)
4846 i+=blen;
4847 else {
4848 while (blen--) {
// Positions too close to the end of input are not inserted, because the hash
// update reads in[i+minMatch] and in[i+minMatch2+lookahead].
4849 if (i+minMatchBoth<n) {
4850 unsigned ih=((i*1234547)>>19)&bucket;
4851 const unsigned p=(i<<checkbits)|(in[i+3]&mask);
4852 assert(ih<=bucket);
4853 if (minMatch2) {
4854 ht[h2^ih]=p;
4855 h2=(((h2*9)<<shift2)
4856 +(in[i+minMatch2+lookahead]+1)*23456789)&(htsize-1);
4857 }
4858 ht[h1^ih]=p;
4859 h1=(((h1*5)<<shift1)+(in[i+minMatch]+1)*123456791)&(htsize-1);
4860 }
4861 ++i;
4862 }
4863 }
4864
4865 // Write long literals to keep buf from filling up
4866 if (lit>=maxLiteral)
4867 write_literal(i, lit);
4868 }
4869
4870 // Write pending literals at end of input
4871 assert(i<=n);
4872 if (i==n) {
4873 write_literal(n, lit);
4874 flush();
4875 }
4876 }
4877
4878 // Write literal sequence in[i-lit..i-1], set lit=0
write_literal(unsigned i,unsigned & lit)4879 void LZBuffer::write_literal(unsigned i, unsigned& lit) {
4880 assert(lit>=0);
4881 assert(i>=0 && i<=n);
4882 assert(i>=lit);
4883 if (level==1) {
4884 if (lit<1) return;
4885 int ll=lg(lit);
4886 assert(ll>=1 && ll<=24);
4887 putb(0, 2);
4888 --ll;
4889 while (--ll>=0) {
4890 putb(1, 1);
4891 putb((lit>>ll)&1, 1);
4892 }
4893 putb(0, 1);
4894 while (lit) putb(in[i-lit--], 8);
4895 }
4896 else {
4897 assert(level==2);
4898 while (lit>0) {
4899 unsigned lit1=lit;
4900 if (lit1>64) lit1=64;
4901 put(lit1-1);
4902 for (unsigned j=i-lit; j<i-lit+lit1; ++j) put(in[j]);
4903 lit-=lit1;
4904 }
4905 }
4906 }
4907
4908 // Write match sequence of given length and offset
write_match(unsigned len,unsigned off)4909 void LZBuffer::write_match(unsigned len, unsigned off) {
4910
4911 // mm,mmm,n,ll,r,q[mmmmm-8] = match n*4+ll, offset ((q-1)<<rb)+r+1
4912 if (level==1) {
4913 assert(len>=minMatch && len<=maxMatch);
4914 assert(off>0);
4915 assert(len>=4);
4916 assert(rb>=0 && rb<=8);
4917 int ll=lg(len)-1;
4918 assert(ll>=2);
4919 off+=(1<<rb)-1;
4920 int lo=lg(off)-1-rb;
4921 assert(lo>=0 && lo<=23);
4922 putb((lo+8)>>3, 2);// mm
4923 putb(lo&7, 3); // mmm
4924 while (--ll>=2) { // n
4925 putb(1, 1);
4926 putb((len>>ll)&1, 1);
4927 }
4928 putb(0, 1);
4929 putb(len&3, 2); // ll
4930 putb(off, rb); // r
4931 putb(off>>rb, lo); // q
4932 }
4933
4934 // x[2]:len[6] off[x-1]
4935 else {
4936 assert(level==2);
4937 assert(minMatch>=1 && minMatch<=64);
4938 --off;
4939 while (len>0) { // Split long matches to len1=minMatch..minMatch+63
4940 const unsigned len1=len>minMatch*2+63 ? minMatch+63 :
4941 len>minMatch+63 ? len-minMatch : len;
4942 assert(wpos<BUFSIZE-5);
4943 assert(len1>=minMatch && len1<minMatch+64);
4944 if (off<(1<<16)) {
4945 put(64+len1-minMatch);
4946 put(off>>8);
4947 put(off);
4948 }
4949 else if (off<(1<<24)) {
4950 put(128+len1-minMatch);
4951 put(off>>16);
4952 put(off>>8);
4953 put(off);
4954 }
4955 else {
4956 put(192+len1-minMatch);
4957 put(off>>24);
4958 put(off>>16);
4959 put(off>>8);
4960 put(off);
4961 }
4962 len-=len1;
4963 }
4964 }
4965 }
4966
4967 // Generate a config file from the method argument with syntax:
4968 // {0|x|s|i}[N1[,N2]...][{ciamtswf<cfg>}[N1[,N2]]...]...
makeConfig(const char * method,int args[])4969 string makeConfig(const char* method, int args[]) {
4970 assert(method);
4971 const char type=method[0];
4972 assert(type=='x' || type=='s' || type=='0' || type=='i');
4973
4974 // Read "{x|s|i|0}N1,N2...N9" into args[0..8] ($1..$9)
4975 args[0]=0; // log block size in MiB
4976 args[1]=0; // 0=none, 1=var-LZ77, 2=byte-LZ77, 3=BWT, 4..7 adds E8E9
4977 args[2]=0; // lz77 minimum match length
4978 args[3]=0; // secondary context length
4979 args[4]=0; // log searches
4980 args[5]=0; // lz77 hash table size or SA if args[0]+21
4981 args[6]=0; // secondary context look ahead
4982 args[7]=0; // not used
4983 args[8]=0; // not used
4984 if (isdigit(*++method)) args[0]=0;
4985 for (int i=0; i<9 && (isdigit(*method) || *method==',' || *method=='.');) {
4986 if (isdigit(*method))
4987 args[i]=args[i]*10+*method-'0';
4988 else if (++i<9)
4989 args[i]=0;
4990 ++method;
4991 }
4992
4993 // "0..." = No compression
4994 if (type=='0')
4995 return "comp 0 0 0 0 0 hcomp end\n";
4996
4997 // Generate the postprocessor
4998 string hdr, pcomp;
4999 const int level=args[1]&3;
5000 const bool doe8=args[1]>=4 && args[1]<=7;
5001
5002 // LZ77+Huffman, with or without E8E9
5003 if (level==1) {
5004 const int rb=args[0]>4 ? args[0]-4 : 0;
5005 hdr="comp 9 16 0 $1+20 ";
5006 pcomp=
5007 "pcomp lazy2 3 ;\n"
5008 " (r1 = state\n"
5009 " r2 = len - match or literal length\n"
5010 " r3 = m - number of offset bits expected\n"
5011 " r4 = ptr to buf\n"
5012 " r5 = r - low bits of offset\n"
5013 " c = bits - input buffer\n"
5014 " d = n - number of bits in c)\n"
5015 "\n"
5016 " a> 255 if\n";
5017 if (doe8)
5018 pcomp+=
5019 " b=0 d=r 4 do (for b=0..d-1, d = end of buf)\n"
5020 " a=b a==d ifnot\n"
5021 " a+= 4 a<d if\n"
5022 " a=*b a&= 254 a== 232 if (e8 or e9?)\n"
5023 " c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if (00 or ff)\n"
5024 " b-- a=*b\n"
5025 " b-- a<<= 8 a+=*b\n"
5026 " b-- a<<= 8 a+=*b\n"
5027 " a-=b a++\n"
5028 " *b=a a>>= 8 b++\n"
5029 " *b=a a>>= 8 b++\n"
5030 " *b=a b++\n"
5031 " endif\n"
5032 " b=c\n"
5033 " endif\n"
5034 " endif\n"
5035 " a=*b out b++\n"
5036 " forever\n"
5037 " endif\n"
5038 "\n";
5039 pcomp+=
5040 " (reset state)\n"
5041 " a=0 b=0 c=0 d=0 r=a 1 r=a 2 r=a 3 r=a 4\n"
5042 " halt\n"
5043 " endif\n"
5044 "\n"
5045 " a<<=d a+=c c=a (bits+=a<<n)\n"
5046 " a= 8 a+=d d=a (n+=8)\n"
5047 "\n"
5048 " (if state==0 (expect new code))\n"
5049 " a=r 1 a== 0 if (match code mm,mmm)\n"
5050 " a= 1 r=a 2 (len=1)\n"
5051 " a=c a&= 3 a> 0 if (if (bits&3))\n"
5052 " a-- a<<= 3 r=a 3 (m=((bits&3)-1)*8)\n"
5053 " a=c a>>= 2 c=a (bits>>=2)\n"
5054 " b=r 3 a&= 7 a+=b r=a 3 (m+=bits&7)\n"
5055 " a=c a>>= 3 c=a (bits>>=3)\n"
5056 " a=d a-= 5 d=a (n-=5)\n"
5057 " a= 1 r=a 1 (state=1)\n"
5058 " else (literal, discard 00)\n"
5059 " a=c a>>= 2 c=a (bits>>=2)\n"
5060 " d-- d-- (n-=2)\n"
5061 " a= 3 r=a 1 (state=3)\n"
5062 " endif\n"
5063 " endif\n"
5064 "\n"
5065 " (while state==1 && n>=3 (expect match length n*4+ll -> r2))\n"
5066 " do a=r 1 a== 1 if a=d a> 2 if\n"
5067 " a=c a&= 1 a== 1 if (if bits&1)\n"
5068 " a=c a>>= 1 c=a (bits>>=1)\n"
5069 " b=r 2 a=c a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n"
5070 " a=c a>>= 1 c=a (bits>>=1)\n"
5071 " d-- d-- (n-=2)\n"
5072 " else\n"
5073 " a=c a>>= 1 c=a (bits>>=1)\n"
5074 " a=r 2 a<<= 2 b=a (len<<=2)\n"
5075 " a=c a&= 3 a+=b r=a 2 (len+=bits&3)\n"
5076 " a=c a>>= 2 c=a (bits>>=2)\n"
5077 " d-- d-- d-- (n-=3)\n";
5078 if (rb)
5079 pcomp+=" a= 5 r=a 1 (state=5)\n";
5080 else
5081 pcomp+=" a= 2 r=a 1 (state=2)\n";
5082 pcomp+=
5083 " endif\n"
5084 " forever endif endif\n"
5085 "\n";
5086 if (rb) pcomp+= // save r in r5
5087 " (if state==5 && n>=8) (expect low bits of offset to put in r5)\n"
5088 " a=r 1 a== 5 if a=d a> "+itos(rb-1)+" if\n"
5089 " a=c a&= "+itos((1<<rb)-1)+" r=a 5 (save r in r5)\n"
5090 " a=c a>>= "+itos(rb)+" c=a\n"
5091 " a=d a-= "+itos(rb)+ " d=a\n"
5092 " a= 2 r=a 1 (go to state 2)\n"
5093 " endif endif\n"
5094 "\n";
5095 pcomp+=
5096 " (if state==2 && n>=m) (expect m offset bits)\n"
5097 " a=r 1 a== 2 if a=r 3 a>d ifnot\n"
5098 " a=c r=a 6 a=d r=a 7 (save c=bits, d=n in r6,r7)\n"
5099 " b=r 3 a= 1 a<<=b d=a (d=1<<m)\n"
5100 " a-- a&=c a+=d (d=offset=bits&((1<<m)-1)|(1<<m))\n";
5101 if (rb)
5102 pcomp+= // insert r into low bits of d
5103 " a<<= "+itos(rb)+" d=r 5 a+=d a-= "+itos((1<<rb)-1)+"\n";
5104 pcomp+=
5105 " d=a b=r 4 a=b a-=d c=a (c=p=(b=ptr)-offset)\n"
5106 "\n"
5107 " (while len-- (copy and output match d bytes from *c to *b))\n"
5108 " d=r 2 do a=d a> 0 if d--\n"
5109 " a=*c *b=a c++ b++ (buf[ptr++]-buf[p++])\n";
5110 if (!doe8) pcomp+=" out\n";
5111 pcomp+=
5112 " forever endif\n"
5113 " a=b r=a 4\n"
5114 "\n"
5115 " a=r 6 b=r 3 a>>=b c=a (bits>>=m)\n"
5116 " a=r 7 a-=b d=a (n-=m)\n"
5117 " a=0 r=a 1 (state=0)\n"
5118 " endif endif\n"
5119 "\n"
5120 " (while state==3 && n>=2 (expect literal length))\n"
5121 " do a=r 1 a== 3 if a=d a> 1 if\n"
5122 " a=c a&= 1 a== 1 if (if bits&1)\n"
5123 " a=c a>>= 1 c=a (bits>>=1)\n"
5124 " b=r 2 a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n"
5125 " a=c a>>= 1 c=a (bits>>=1)\n"
5126 " d-- d-- (n-=2)\n"
5127 " else\n"
5128 " a=c a>>= 1 c=a (bits>>=1)\n"
5129 " d-- (--n)\n"
5130 " a= 4 r=a 1 (state=4)\n"
5131 " endif\n"
5132 " forever endif endif\n"
5133 "\n"
5134 " (if state==4 && n>=8 (expect len literals))\n"
5135 " a=r 1 a== 4 if a=d a> 7 if\n"
5136 " b=r 4 a=c *b=a\n";
5137 if (!doe8) pcomp+=" out\n";
5138 pcomp+=
5139 " b++ a=b r=a 4 (buf[ptr++]=bits)\n"
5140 " a=c a>>= 8 c=a (bits>>=8)\n"
5141 " a=d a-= 8 d=a (n-=8)\n"
5142 " a=r 2 a-- r=a 2 a== 0 if (if --len<1)\n"
5143 " a=0 r=a 1 (state=0)\n"
5144 " endif\n"
5145 " endif endif\n"
5146 " halt\n"
5147 "end\n";
5148 }
5149
5150 // Byte aligned LZ77, with or without E8E9
5151 else if (level==2) {
5152 hdr="comp 9 16 0 $1+20 ";
5153 pcomp=
5154 "pcomp lzpre c ;\n"
5155 " (Decode LZ77: d=state, M=output buffer, b=size)\n"
5156 " a> 255 if (at EOF decode e8e9 and output)\n";
5157 if (doe8)
5158 pcomp+=
5159 " d=b b=0 do (for b=0..d-1, d = end of buf)\n"
5160 " a=b a==d ifnot\n"
5161 " a+= 4 a<d if\n"
5162 " a=*b a&= 254 a== 232 if (e8 or e9?)\n"
5163 " c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if (00 or ff)\n"
5164 " b-- a=*b\n"
5165 " b-- a<<= 8 a+=*b\n"
5166 " b-- a<<= 8 a+=*b\n"
5167 " a-=b a++\n"
5168 " *b=a a>>= 8 b++\n"
5169 " *b=a a>>= 8 b++\n"
5170 " *b=a b++\n"
5171 " endif\n"
5172 " b=c\n"
5173 " endif\n"
5174 " endif\n"
5175 " a=*b out b++\n"
5176 " forever\n"
5177 " endif\n";
5178 pcomp+=
5179 " b=0 c=0 d=0 a=0 r=a 1 r=a 2 (reset state)\n"
5180 " halt\n"
5181 " endif\n"
5182 "\n"
5183 " (in state d==0, expect a new code)\n"
5184 " (put length in r1 and inital part of offset in r2)\n"
5185 " c=a a=d a== 0 if\n"
5186 " a=c a>>= 6 a++ d=a\n"
5187 " a== 1 if (literal?)\n"
5188 " a+=c r=a 1 a=0 r=a 2\n"
5189 " else (3 to 5 byte match)\n"
5190 " d++ a=c a&= 63 a+= $3 r=a 1 a=0 r=a 2\n"
5191 " endif\n"
5192 " else\n"
5193 " a== 1 if (writing literal)\n"
5194 " a=c *b=a b++\n";
5195 if (!doe8) pcomp+=" out\n";
5196 pcomp+=
5197 " a=r 1 a-- a== 0 if d=0 endif r=a 1 (if (--len==0) state=0)\n"
5198 " else\n"
5199 " a> 2 if (reading offset)\n"
5200 " a=r 2 a<<= 8 a|=c r=a 2 d-- (off=off<<8|c, --state)\n"
5201 " else (state==2, write match)\n"
5202 " a=r 2 a<<= 8 a|=c c=a a=b a-=c a-- c=a (c=i-off-1)\n"
5203 " d=r 1 (d=len)\n"
5204 " do (copy and output d=len bytes)\n"
5205 " a=*c *b=a c++ b++\n";
5206 if (!doe8) pcomp+=" out\n";
5207 pcomp+=
5208 " d-- a=d a> 0 while\n"
5209 " (d=state=0. off, len don\'t matter)\n"
5210 " endif\n"
5211 " endif\n"
5212 " endif\n"
5213 " halt\n"
5214 "end\n";
5215 }
5216
5217 // BWT with or without E8E9
5218 else if (level==3) { // IBWT
5219 hdr="comp 9 16 $1+20 $1+20 "; // 2^$1 = block size in MB
5220 pcomp=
5221 "pcomp bwtrle c ;\n"
5222 "\n"
5223 " (read BWT, index into M, size in b)\n"
5224 " a> 255 ifnot\n"
5225 " *b=a b++\n"
5226 "\n"
5227 " (inverse BWT)\n"
5228 " elsel\n"
5229 "\n"
5230 " (index in last 4 bytes, put in c and R1)\n"
5231 " b-- a=*b\n"
5232 " b-- a<<= 8 a+=*b\n"
5233 " b-- a<<= 8 a+=*b\n"
5234 " b-- a<<= 8 a+=*b c=a r=a 1\n"
5235 "\n"
5236 " (save size in R2)\n"
5237 " a=b r=a 2\n"
5238 "\n"
5239 " (count bytes in H[~1..~255, ~0])\n"
5240 " do\n"
5241 " a=b a> 0 if\n"
5242 " b-- a=*b a++ a&= 255 d=a d! *d++\n"
5243 " forever\n"
5244 " endif\n"
5245 "\n"
5246 " (cumulative counts: H[~i=0..255] = count of bytes before i)\n"
5247 " d=0 d! *d= 1 a=0\n"
5248 " do\n"
5249 " a+=*d *d=a d--\n"
5250 " d<>a a! a> 255 a! d<>a until\n"
5251 "\n"
5252 " (build first part of linked list in H[0..idx-1])\n"
5253 " b=0 do\n"
5254 " a=c a>b if\n"
5255 " d=*b d! *d++ d=*d d-- *d=b\n"
5256 " b++ forever\n"
5257 " endif\n"
5258 "\n"
5259 " (rest of list in H[idx+1..n-1])\n"
5260 " b=c b++ c=r 2 do\n"
5261 " a=c a>b if\n"
5262 " d=*b d! *d++ d=*d d-- *d=b\n"
5263 " b++ forever\n"
5264 " endif\n"
5265 "\n";
5266 if (args[0]<=4) { // faster IBWT list traversal limited to 16 MB blocks
5267 pcomp+=
5268 " (copy M to low 8 bits of H to reduce cache misses in next loop)\n"
5269 " b=0 do\n"
5270 " a=c a>b if\n"
5271 " d=b a=*d a<<= 8 a+=*b *d=a\n"
5272 " b++ forever\n"
5273 " endif\n"
5274 "\n"
5275 " (traverse list and output or copy to M)\n"
5276 " d=r 1 b=0 do\n"
5277 " a=d a== 0 ifnot\n"
5278 " a=*d a>>= 8 d=a\n";
5279 if (doe8) pcomp+=" *b=*d b++\n";
5280 else pcomp+=" a=*d out\n";
5281 pcomp+=
5282 " forever\n"
5283 " endif\n"
5284 "\n";
5285 if (doe8) // IBWT+E8E9
5286 pcomp+=
5287 " (e8e9 transform to out)\n"
5288 " d=b b=0 do (for b=0..d-1, d = end of buf)\n"
5289 " a=b a==d ifnot\n"
5290 " a+= 4 a<d if\n"
5291 " a=*b a&= 254 a== 232 if\n"
5292 " c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if\n"
5293 " b-- a=*b\n"
5294 " b-- a<<= 8 a+=*b\n"
5295 " b-- a<<= 8 a+=*b\n"
5296 " a-=b a++\n"
5297 " *b=a a>>= 8 b++\n"
5298 " *b=a a>>= 8 b++\n"
5299 " *b=a b++\n"
5300 " endif\n"
5301 " b=c\n"
5302 " endif\n"
5303 " endif\n"
5304 " a=*b out b++\n"
5305 " forever\n"
5306 " endif\n";
5307 pcomp+=
5308 " endif\n"
5309 " halt\n"
5310 "end\n";
5311 }
5312 else { // slower IBWT list traversal for all sized blocks
5313 if (doe8) { // E8E9 after IBWT
5314 pcomp+=
5315 " (R2 = output size without EOS)\n"
5316 " a=r 2 a-- r=a 2\n"
5317 "\n"
5318 " (traverse list (d = IBWT pointer) and output inverse e8e9)\n"
5319 " (C = offset = 0..R2-1)\n"
5320 " (R4 = last 4 bytes shifted in from MSB end)\n"
5321 " (R5 = temp pending output byte)\n"
5322 " c=0 d=r 1 do\n"
5323 " a=d a== 0 ifnot\n"
5324 " d=*d\n"
5325 "\n"
5326 " (store byte in R4 and shift out to R5)\n"
5327 " b=d a=*b a<<= 24 b=a\n"
5328 " a=r 4 r=a 5 a>>= 8 a|=b r=a 4\n"
5329 "\n"
5330 " (if E8|E9 xx xx xx 00|FF in R4:R5 then subtract c from x)\n"
5331 " a=c a> 3 if\n"
5332 " a=r 5 a&= 254 a== 232 if\n"
5333 " a=r 4 a>>= 24 b=a a++ a&= 254 a< 2 if\n"
5334 " a=r 4 a-=c a+= 4 a<<= 8 a>>= 8 \n"
5335 " b<>a a<<= 24 a+=b r=a 4\n"
5336 " endif\n"
5337 " endif\n"
5338 " endif\n"
5339 "\n"
5340 " (output buffered byte)\n"
5341 " a=c a> 3 if a=r 5 out endif c++\n"
5342 "\n"
5343 " forever\n"
5344 " endif\n"
5345 "\n"
5346 " (output up to 4 pending bytes in R4)\n"
5347 " b=r 4\n"
5348 " a=c a> 3 a=b if out endif a>>= 8 b=a\n"
5349 " a=c a> 2 a=b if out endif a>>= 8 b=a\n"
5350 " a=c a> 1 a=b if out endif a>>= 8 b=a\n"
5351 " a=c a> 0 a=b if out endif\n"
5352 "\n"
5353 " endif\n"
5354 " halt\n"
5355 "end\n";
5356 }
5357 else {
5358 pcomp+=
5359 " (traverse list and output)\n"
5360 " d=r 1 do\n"
5361 " a=d a== 0 ifnot\n"
5362 " d=*d\n"
5363 " b=d a=*b out\n"
5364 " forever\n"
5365 " endif\n"
5366 " endif\n"
5367 " halt\n"
5368 "end\n";
5369 }
5370 }
5371 }
5372
5373 // E8E9 or no preprocessing
5374 else if (level==0) {
5375 hdr="comp 9 16 0 0 ";
5376 if (doe8) { // E8E9?
5377 pcomp=
5378 "pcomp e8e9 d ;\n"
5379 " a> 255 if\n"
5380 " a=c a> 4 if\n"
5381 " c= 4\n"
5382 " else\n"
5383 " a! a+= 5 a<<= 3 d=a a=b a>>=d b=a\n"
5384 " endif\n"
5385 " do a=c a> 0 if\n"
5386 " a=b out a>>= 8 b=a c--\n"
5387 " forever endif\n"
5388 " else\n"
5389 " *b=b a<<= 24 d=a a=b a>>= 8 a+=d b=a c++\n"
5390 " a=c a> 4 if\n"
5391 " a=*b out\n"
5392 " a&= 254 a== 232 if\n"
5393 " a=b a>>= 24 a++ a&= 254 a== 0 if\n"
5394 " a=b a>>= 24 a<<= 24 d=a\n"
5395 " a=b a-=c a+= 5\n"
5396 " a<<= 8 a>>= 8 a|=d b=a\n"
5397 " endif\n"
5398 " endif\n"
5399 " endif\n"
5400 " endif\n"
5401 " halt\n"
5402 "end\n";
5403 }
5404 else
5405 pcomp="end\n";
5406 }
5407 else
5408 error("Unsupported method");
5409
5410 // Build context model (comp, hcomp) assuming:
5411 // H[0..254] = contexts
5412 // H[255..511] = location of last byte i-255
5413 // M = last 64K bytes, filling backward
5414 // C = pointer to most recent byte
5415 // R1 = level 2 lz77 1+bytes expected until next code, 0=init
5416 // R2 = level 2 lz77 first byte of code
5417 int ncomp=0; // number of components
5418 const int membits=args[0]+20;
5419 int sb=5; // bits in last context
5420 string comp;
5421 string hcomp="hcomp\n"
5422 "c-- *c=a a+= 255 d=a *d=c\n";
5423 if (level==2) { // put level 2 lz77 parse state in R1, R2
5424 hcomp+=
5425 " (decode lz77 into M. Codes:\n"
5426 " 00xxxxxx = literal length xxxxxx+1\n"
5427 " xx......, xx > 0 = match with xx offset bytes to follow)\n"
5428 "\n"
5429 " a=r 1 a== 0 if (init)\n"
5430 " a= "+itos(111+57*doe8)+" (skip post code)\n"
5431 " else a== 1 if (new code?)\n"
5432 " a=*c r=a 2 (save code in R2)\n"
5433 " a> 63 if a>>= 6 a++ a++ (match)\n"
5434 " else a++ a++ endif (literal)\n"
5435 " else (read rest of code)\n"
5436 " a--\n"
5437 " endif endif\n"
5438 " r=a 1 (R1 = 1+expected bytes to next code)\n";
5439 }
5440
5441 // Generate the context model
5442 while (*method && ncomp<254) {
5443
5444 // parse command C[N1[,N2]...] into v = {C, N1, N2...}
5445 vector<int> v;
5446 v.push_back(*method++);
5447 if (isdigit(*method)) {
5448 v.push_back(*method++-'0');
5449 while (isdigit(*method) || *method==',' || *method=='.') {
5450 if (isdigit(*method))
5451 v.back()=v.back()*10+*method++-'0';
5452 else {
5453 v.push_back(0);
5454 ++method;
5455 }
5456 }
5457 }
5458
5459 // c: context model
5460 // N1%1000: 0=ICM 1..256=CM limit N1-1
5461 // N1/1000: number of times to halve memory
5462 // N2: 1..255=offset mod N2. 1000..1255=distance to N2-1000
5463 // N3...: 0..255=byte mask + 256=lz77 state. 1000+=run of N3-1000 zeros.
5464 if (v[0]=='c') {
5465 while (v.size()<3) v.push_back(0);
5466 comp+=itos(ncomp)+" ";
5467 sb=11; // count context bits
5468 if (v[2]<256) sb+=lg(v[2]);
5469 else sb+=6;
5470 for (unsigned i=3; i<v.size(); ++i)
5471 if (v[i]<512) sb+=nbits(v[i])*3/4;
5472 if (sb>membits) sb=membits;
5473 if (v[1]%1000==0) comp+="icm "+itos(sb-6-v[1]/1000)+"\n";
5474 else comp+="cm "+itos(sb-2-v[1]/1000)+" "+itos(v[1]%1000-1)+"\n";
5475
5476 // special contexts
5477 hcomp+="d= "+itos(ncomp)+" *d=0\n";
5478 if (v[2]>1 && v[2]<=255) { // periodic context
5479 if (lg(v[2])!=lg(v[2]-1))
5480 hcomp+="a=c a&= "+itos(v[2]-1)+" hashd\n";
5481 else
5482 hcomp+="a=c a%= "+itos(v[2])+" hashd\n";
5483 }
5484 else if (v[2]>=1000 && v[2]<=1255) // distance context
5485 hcomp+="a= 255 a+= "+itos(v[2]-1000)+
5486 " d=a a=*d a-=c a> 255 if a= 255 endif d= "+
5487 itos(ncomp)+" hashd\n";
5488
5489 // Masked context
5490 for (unsigned i=3; i<v.size(); ++i) {
5491 if (i==3) hcomp+="b=c ";
5492 if (v[i]==255)
5493 hcomp+="a=*b hashd\n"; // ordinary byte
5494 else if (v[i]>0 && v[i]<255)
5495 hcomp+="a=*b a&= "+itos(v[i])+" hashd\n"; // masked byte
5496 else if (v[i]>=256 && v[i]<512) { // lz77 state or masked literal byte
5497 hcomp+=
5498 "a=r 1 a> 1 if\n" // expect literal or offset
5499 " a=r 2 a< 64 if\n" // expect literal
5500 " a=*b ";
5501 if (v[i]<511) hcomp+="a&= "+itos(v[i]-256);
5502 hcomp+=" hashd\n"
5503 " else\n" // expect match offset byte
5504 " a>>= 6 hashd a=r 1 hashd\n"
5505 " endif\n"
5506 "else\n" // expect new code
5507 " a= 255 hashd a=r 2 hashd\n"
5508 "endif\n";
5509 }
5510 else if (v[i]>=1256) // skip v[i]-1000 bytes
5511 hcomp+="a= "+itos(((v[i]-1000)>>8)&255)+" a<<= 8 a+= "
5512 +itos((v[i]-1000)&255)+
5513 " a+=b b=a\n";
5514 else if (v[i]>1000)
5515 hcomp+="a= "+itos(v[i]-1000)+" a+=b b=a\n";
5516 if (v[i]<512 && i<v.size()-1)
5517 hcomp+="b++ ";
5518 }
5519 ++ncomp;
5520 }
5521
5522 // m,8,24: MIX, size, rate
5523 // t,8,24: MIX2, size, rate
5524 // s,8,32,255: SSE, size, start, limit
5525 if (strchr("mts", v[0]) && ncomp>int(v[0]=='t')) {
5526 if (v.size()<=1) v.push_back(8);
5527 if (v.size()<=2) v.push_back(24+8*(v[0]=='s'));
5528 if (v[0]=='s' && v.size()<=3) v.push_back(255);
5529 comp+=itos(ncomp);
5530 sb=5+v[1]*3/4;
5531 if (v[0]=='m')
5532 comp+=" mix "+itos(v[1])+" 0 "+itos(ncomp)+" "+itos(v[2])+" 255\n";
5533 else if (v[0]=='t')
5534 comp+=" mix2 "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(ncomp-2)
5535 +" "+itos(v[2])+" 255\n";
5536 else // s
5537 comp+=" sse "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(v[2])+" "
5538 +itos(v[3])+"\n";
5539 if (v[1]>8) {
5540 hcomp+="d= "+itos(ncomp)+" *d=0 b=c a=0\n";
5541 for (; v[1]>=16; v[1]-=8) {
5542 hcomp+="a<<= 8 a+=*b";
5543 if (v[1]>16) hcomp+=" b++";
5544 hcomp+="\n";
5545 }
5546 if (v[1]>8)
5547 hcomp+="a<<= 8 a+=*b a>>= "+itos(16-v[1])+"\n";
5548 hcomp+="a<<= 8 *d=a\n";
5549 }
5550 ++ncomp;
5551 }
5552
5553 // i: ISSE chain with order increasing by N1,N2...
5554 if (v[0]=='i' && ncomp>0) {
5555 assert(sb>=5);
5556 hcomp+="d= "+itos(ncomp-1)+" b=c a=*d d++\n";
5557 for (unsigned i=1; i<v.size() && ncomp<254; ++i) {
5558 for (int j=0; j<v[i]%10; ++j) {
5559 hcomp+="hash ";
5560 if (i<v.size()-1 || j<v[i]%10-1) hcomp+="b++ ";
5561 sb+=6;
5562 }
5563 hcomp+="*d=a";
5564 if (i<v.size()-1) hcomp+=" d++";
5565 hcomp+="\n";
5566 if (sb>membits) sb=membits;
5567 comp+=itos(ncomp)+" isse "+itos(sb-6-v[i]/10)+" "+itos(ncomp-1)+"\n";
5568 ++ncomp;
5569 }
5570 }
5571
5572 // a24,0,0: MATCH. N1=hash multiplier. N2,N3=halve buf, table.
5573 if (v[0]=='a') {
5574 if (v.size()<=1) v.push_back(24);
5575 while (v.size()<4) v.push_back(0);
5576 comp+=itos(ncomp)+" match "+itos(membits-v[3]-2)+" "
5577 +itos(membits-v[2])+"\n";
5578 hcomp+="d= "+itos(ncomp)+" a=*d a*= "+itos(v[1])
5579 +" a+=*c a++ *d=a\n";
5580 sb=5+(membits-v[2])*3/4;
5581 ++ncomp;
5582 }
5583
5584 // w1,65,26,223,20,0: ICM-ISSE chain of length N1 with word contexts,
5585 // where a word is a sequence of c such that c&N4 is in N2..N2+N3-1.
5586 // Word is hashed by: hash := hash*N5+c+1
5587 // Decrease memory by 2^-N6.
5588 if (v[0]=='w') {
5589 if (v.size()<=1) v.push_back(1);
5590 if (v.size()<=2) v.push_back(65);
5591 if (v.size()<=3) v.push_back(26);
5592 if (v.size()<=4) v.push_back(223);
5593 if (v.size()<=5) v.push_back(20);
5594 if (v.size()<=6) v.push_back(0);
5595 comp+=itos(ncomp)+" icm "+itos(membits-6-v[6])+"\n";
5596 for (int i=1; i<v[1]; ++i)
5597 comp+=itos(ncomp+i)+" isse "+itos(membits-6-v[6])+" "
5598 +itos(ncomp+i-1)+"\n";
5599 hcomp+="a=*c a&= "+itos(v[4])+" a-= "+itos(v[2])+" a&= 255 a< "
5600 +itos(v[3])+" if\n";
5601 for (int i=0; i<v[1]; ++i) {
5602 if (i==0) hcomp+=" d= "+itos(ncomp);
5603 else hcomp+=" d++";
5604 hcomp+=" a=*d a*= "+itos(v[5])+" a+=*c a++ *d=a\n";
5605 }
5606 hcomp+="else\n";
5607 for (int i=v[1]-1; i>0; --i)
5608 hcomp+=" d= "+itos(ncomp+i-1)+" a=*d d++ *d=a\n";
5609 hcomp+=" d= "+itos(ncomp)+" *d=0\n"
5610 "endif\n";
5611 ncomp+=v[1]-1;
5612 sb=membits-v[6];
5613 ++ncomp;
5614 }
5615
5616 // Read from config file and ignore rest of command
5617 if (v[0]=='f') {
5618 string filename=method; // append .cfg if not already
5619 int len=filename.size();
5620 if (len<=4 || filename.substr(len-4)!=".cfg") filename+=".cfg";
5621 FILE* in=fopen(filename.c_str(), "r");
5622 if (!in) {
5623 perror(filename.c_str());
5624 error("Config file not found");
5625 }
5626 string cfg;
5627 int c;
5628 while ((c=getc(in))!=EOF) cfg+=(char)c;
5629 fclose(in);
5630 return cfg;
5631 }
5632 }
5633 return hdr+itos(ncomp)+"\n"+comp+hcomp+"halt\n"+pcomp;
5634 }
5635
5636 // Compress from in to out in 1 segment in 1 block using the algorithm
5637 // descried in method. If method begins with a digit then choose
5638 // a method depending on type. Save filename and comment
5639 // in the segment header. If comment is 0 then the default is the input size
5640 // as a decimal string, plus " jDC\x01" for a journaling method (method[0]
5641 // is not 's'). type is set as follows: bits 9-2 estimate compressibility
5642 // where 0 means random. Bit 1 indicates x86 (exe or dll) and bit 0
5643 // indicates English text.
compressBlock(StringBuffer * in,libzpaq::Writer * out,string method,const char * filename=0,const char * comment=0,unsigned type=512)5644 string compressBlock(StringBuffer* in, libzpaq::Writer* out, string method,
5645 const char* filename=0, const char* comment=0,
5646 unsigned type=512) {
5647 assert(in);
5648 assert(out);
5649 assert(method!="");
5650 const unsigned n=in->size(); // input size
5651 const int arg0=max(lg(n+4095)-20, 0); // block size
5652 assert((1u<<(arg0+20))>=n+4096);
5653
5654 // Get hash of input
5655 libzpaq::SHA1 sha1;
5656 const char* sha1ptr=0;
5657 if (!fragile) {
5658 for (const char* p=in->c_str(), *end=p+n; p<end; ++p)
5659 sha1.put(*p);
5660 sha1ptr=sha1.result();
5661 }
5662
5663 // Expand default methods
5664 if (isdigit(method[0])) {
5665 const int level=method[0]-'0';
5666 assert(level>=0 && level<=9);
5667
5668 // build models
5669 const int doe8=(type&2)*2;
5670 method="x"+itos(arg0);
5671 string htsz=","+itos(19+arg0+(arg0<=6)); // lz77 hash table size
5672 string sasz=","+itos(21+arg0); // lz77 suffix array size
5673
5674 // store uncompressed
5675 if (level==0)
5676 method="0"+itos(arg0)+",0";
5677
5678 // LZ77, no model. Store if hard to compress
5679 else if (level==1) {
5680 if (type<40) method+=",0";
5681 else {
5682 method+=","+itos(1+doe8)+",";
5683 if (type<80) method+="4,0,1,15";
5684 else if (type<128) method+="4,0,2,16";
5685 else if (type<256) method+="4,0,2"+htsz;
5686 else if (type<960) method+="5,0,3"+htsz;
5687 else method+="6,0,3"+htsz;
5688 }
5689 }
5690
5691 // LZ77 with longer search
5692 else if (level==2) {
5693 if (type<32) method+=",0";
5694 else {
5695 method+=","+itos(1+doe8)+",";
5696 if (type<64) method+="4,0,3"+htsz;
5697 else method+="4,0,7"+sasz+",1";
5698 }
5699 }
5700
5701 // LZ77 with CM depending on redundancy
5702 else if (level==3) {
5703 if (type<20) // store if not compressible
5704 method+=",0";
5705 else if (type<48) // fast LZ77 if barely compressible
5706 method+=","+itos(1+doe8)+",4,0,3"+htsz;
5707 else if (type>=640 || (type&1)) // BWT if text or highly compressible
5708 method+=","+itos(3+doe8)+"ci1";
5709 else // LZ77 with O0-1 compression of up to 12 literals
5710 method+=","+itos(2+doe8)+",12,0,7"+sasz+",1c0,0,511i2";
5711 }
5712
5713 // LZ77+CM, fast CM, or BWT depending on type
5714 else if (level==4) {
5715 if (type<12)
5716 method+=",0";
5717 else if (type<24)
5718 method+=","+itos(1+doe8)+",4,0,3"+htsz;
5719 else if (type<48)
5720 method+=","+itos(2+doe8)+",5,0,7"+sasz+"1c0,0,511";
5721 else if (type<900) {
5722 method+=","+itos(doe8)+"ci1,1,1,1,2a";
5723 if (type&1) method+="w";
5724 method+="m";
5725 }
5726 else
5727 method+=","+itos(3+doe8)+"ci1";
5728 }
5729
5730 // Slow CM with lots of models
5731 else { // 5..9
5732
5733 // Model text files
5734 method+=","+itos(doe8);
5735 if (type&1) method+="w2c0,1010,255i1";
5736 else method+="w1i1";
5737 method+="c256ci1,1,1,1,1,1,2a";
5738
5739 // Analyze the data
5740 const int NR=1<<12;
5741 int pt[256]={0}; // position of last occurrence
5742 int r[NR]={0}; // count repetition gaps of length r
5743 const unsigned char* p=in->data();
5744 if (level>0) {
5745 for (unsigned i=0; i<n; ++i) {
5746 const int k=i-pt[p[i]];
5747 if (k>0 && k<NR) ++r[k];
5748 pt[p[i]]=i;
5749 }
5750 }
5751
5752 // Add periodic models
5753 int n1=n-r[1]-r[2]-r[3];
5754 for (int i=0; i<2; ++i) {
5755 int period=0;
5756 double score=0;
5757 int t=0;
5758 for (int j=5; j<NR && t<n1; ++j) {
5759 const double s=r[j]/(256.0+n1-t);
5760 if (s>score) score=s, period=j;
5761 t+=r[j];
5762 }
5763 if (period>4 && score>0.1) {
5764 method+="c0,0,"+itos(999+period)+",255i1";
5765 if (period<=255)
5766 method+="c0,"+itos(period)+"i1";
5767 n1-=r[period];
5768 r[period]=0;
5769 }
5770 else
5771 break;
5772 }
5773 method+="c0,2,0,255i1c0,3,0,0,255i1c0,4,0,0,0,255i1mm16ts19t0";
5774 }
5775 }
5776
5777 // Compress
5778 string config;
5779 int args[9]={0};
5780 try {
5781
5782 // Get config
5783 config=makeConfig(method.c_str(), args);
5784 assert(n<=(0x100000u<<args[0])-4096);
5785
5786 // Compress in to out using config
5787 libzpaq::Compressor co;
5788 co.setOutput(out);
5789 #ifdef DEBUG
5790 if (!fragile) co.setVerify(true);
5791 #endif
5792 StringBuffer pcomp_cmd;
5793 if (!fragile) co.writeTag();
5794 co.startBlock(config.c_str(), args, &pcomp_cmd);
5795 string cs=itos(n);
5796 if (method[0]!='s') cs+=" jDC\x01";
5797 if (comment) cs=comment;
5798 co.startSegment(filename, cs.c_str());
5799 if (args[1]>=1 && args[1]<=7 && args[1]!=4) { // LZ77 or BWT
5800 LZBuffer lz(*in, args);
5801 co.setInput(&lz);
5802 co.compress();
5803 }
5804 else { // compress with e8e9 or no preprocessing
5805 if (args[1]>=4 && args[1]<=7)
5806 e8e9(in->data(), in->size());
5807 co.setInput(in);
5808 co.compress();
5809 }
5810 in->reset();
5811 #ifdef DEBUG // verify pre-post processing are inverses
5812 if (fragile)
5813 co.endSegment(0);
5814 else {
5815 int64_t outsize;
5816 const char* sha1result=co.endSegmentChecksum(&outsize);
5817 assert(sha1result);
5818 assert(sha1ptr);
5819 if (memcmp(sha1result, sha1ptr, 20)!=0) {
5820 fprintf(stderr, "pre size=%d post size=%1.0f method=%s\n",
5821 n, double(outsize), method.c_str());
5822 error("Pre/post-processor test failed");
5823 }
5824 }
5825 #else
5826 co.endSegment(sha1ptr);
5827 #endif
5828 co.endBlock();
5829 }
5830 catch(std::exception& e) {
5831 fprintf(con, "Compression error %s\n", e.what());
5832 fprintf(con, "\nconfig:\n%s\n", config.c_str());
5833 fprintf(con, "\nmethod=%s\n", method.c_str());
5834 for (int i=0; i<9; ++i)
5835 fprintf(con, "args[%d] = $%d = %d\n", i, i+1, args[i]);
5836 error("compression error");
5837 }
5838 return method;
5839 }
5840
// A CompressJob is a queue of blocks to compress and write to the archive.
// Each block cycles through the states EMPTY, FULL, COMPRESSING,
// COMPRESSED, WRITING. The main thread waits for EMPTY buffers and
// fills them. A set of compressThreads waits for FULL buffers and compresses
// them. A writeThread waits for COMPRESSED buffers at the front
// of the queue and writes and removes them.
5847
5848 // Buffer queue element
5849 struct CJ {
5850 enum {EMPTY, FULL, COMPRESSING, COMPRESSED, WRITING} state;
5851 StringBuffer in; // uncompressed input
5852 WriteBuffer out; // compressed output
5853 string filename; // to write in filename field
5854 string method; // compression level or "" to mark end of data
5855 int type; // redundancy*4 + exe*2 + text
5856 Semaphore full; // 1 if in is FULL of data ready to compress
5857 Semaphore compressed; // 1 if out contains COMPRESSED data
CJCJ5858 CJ(): state(EMPTY), type(512) {}
5859 };
5860
5861 // Instructions to a compression job
5862 class CompressJob {
5863 public:
5864 Mutex mutex; // protects state changes
5865 private:
5866 int job; // number of jobs
5867 CJ* q; // buffer queue
5868 unsigned qsize; // number of elements in q
5869 int front; // next to remove from queue
5870 libzpaq::Writer* out; // archive
5871 Semaphore empty; // number of empty buffers ready to fill
5872 Semaphore compressors; // number of compressors available to run
5873 public:
5874 friend ThreadReturn compressThread(void* arg);
5875 friend ThreadReturn writeThread(void* arg);
CompressJob(int threads,int buffers,libzpaq::Writer * f)5876 CompressJob(int threads, int buffers, libzpaq::Writer* f):
5877 job(0), q(0), qsize(buffers), front(0), out(f) {
5878 q=new CJ[buffers];
5879 if (!q) throw std::bad_alloc();
5880 init_mutex(mutex);
5881 empty.init(buffers);
5882 compressors.init(threads);
5883 for (int i=0; i<buffers; ++i) {
5884 q[i].full.init(0);
5885 q[i].compressed.init(0);
5886 }
5887 }
~CompressJob()5888 ~CompressJob() {
5889 for (int i=qsize-1; i>=0; --i) {
5890 q[i].compressed.destroy();
5891 q[i].full.destroy();
5892 }
5893 compressors.destroy();
5894 empty.destroy();
5895 destroy_mutex(mutex);
5896 delete[] q;
5897 }
5898 void write(StringBuffer& s, const char* filename, string method,
5899 int hits=-1);
5900 vector<int> csize; // compressed block sizes
5901 };
5902
5903 // Write s at the back of the queue. Signal end of input with method=""
write(StringBuffer & s,const char * fn,string method,int type)5904 void CompressJob::write(StringBuffer& s, const char* fn, string method,
5905 int type) {
5906 for (unsigned k=(method=="")?qsize:1; k>0; --k) {
5907 empty.wait();
5908 lock(mutex);
5909 unsigned i, j;
5910 for (i=0; i<qsize; ++i) {
5911 if (q[j=(i+front)%qsize].state==CJ::EMPTY) {
5912 q[j].filename=fn?fn:"";
5913 q[j].method=method;
5914 q[j].type=type;
5915 q[j].in.reset();
5916 q[j].in.swap(s);
5917 q[j].state=CJ::FULL;
5918 q[j].full.signal();
5919 break;
5920 }
5921 }
5922 release(mutex);
5923 assert(i<qsize); // queue should not be full
5924 }
5925 }
5926
// Global progress indicator.
// compressThread adds to bytes_processed and bytes_output while holding
// the CompressJob mutex and reads all three counters to print progress.
volatile int64_t total_size=0;       // number of input bytes to process
volatile int64_t bytes_processed=0;  // bytes compressed or decompressed
volatile int64_t bytes_output=0;     // output bytes compressed
5931
5932 // Compress data in the background, one per buffer
compressThread(void * arg)5933 ThreadReturn compressThread(void* arg) {
5934 CompressJob& job=*(CompressJob*)arg;
5935 int jobNumber=0;
5936 try {
5937
5938 // Get job number = assigned position in queue
5939 lock(job.mutex);
5940 jobNumber=job.job++;
5941 assert(jobNumber>=0 && jobNumber<int(job.qsize));
5942 CJ& cj=job.q[jobNumber];
5943 release(job.mutex);
5944
5945 // Work until done
5946 while (true) {
5947 cj.full.wait();
5948 lock(job.mutex);
5949
5950 // Check for end of input
5951 if (cj.method=="") {
5952 cj.compressed.signal();
5953 release(job.mutex);
5954 return 0;
5955 }
5956
5957 // Compress
5958 assert(cj.state==CJ::FULL);
5959 cj.state=CJ::COMPRESSING;
5960 int insize=cj.in.size(), start=0, frags=0;
5961 if (insize>=8 && size(cj.filename)==28
5962 && cj.filename.substr(0, 3)=="jDC" && cj.filename[17]=='d') {
5963 const char* p=cj.in.c_str()+insize-8;
5964 start=btoi(p);
5965 frags=btoi(p);
5966 if (!start)
5967 start=atoi(cj.filename.c_str()+18);
5968 }
5969 release(job.mutex);
5970 int64_t now=mtime();
5971 job.compressors.wait();
5972 string m=compressBlock(&cj.in, &cj.out, cj.method, cj.filename.c_str(),
5973 0, cj.type);
5974 job.compressors.signal();
5975 lock(job.mutex);
5976 bytes_processed+=insize-8-4*frags;
5977 bytes_output+=cj.out.size();
5978 if (quiet<MAX_QUIET) {
5979 int64_t eta=(mtime()-global_start+0.0)
5980 *(total_size-bytes_processed)/(bytes_processed+0.5)/1000.0;
5981 if (bytes_processed>0)
5982 fprintf(con, "%d:%02d:%02d",
5983 int(eta/3600), int(eta/60%60), int(eta%60));
5984 if (quiet==MAX_QUIET-1) {
5985 fprintf(con, " to go: %1.6f -> %1.6f MB (%5.2f%%) \r",
5986 bytes_processed/1000000.0, bytes_output/1000000.0,
5987 (bytes_processed+0.5)*100.0/(total_size+0.5));
5988 fflush(con);
5989 }
5990 else {
5991 fprintf(con,
5992 " [%d-%d] %d -> %d (%1.2fs), %d%c %s\n",
5993 start, start+frags-1,
5994 insize, int(cj.out.size()), (mtime()-now)*0.001,
5995 cj.type/4, " teb"[cj.type&3], m.c_str());
5996 }
5997 }
5998 cj.in.reset();
5999 cj.state=CJ::COMPRESSED;
6000 cj.compressed.signal();
6001 release(job.mutex);
6002 }
6003 }
6004 catch (std::exception& e) {
6005 fprintf(stderr, "zpaq exiting from job %d: %s\n", jobNumber+1, e.what());
6006 exit(1);
6007 }
6008 return 0;
6009 }
6010
6011 // Write compressed data to the archive in the background
writeThread(void * arg)6012 ThreadReturn writeThread(void* arg) {
6013 CompressJob& job=*(CompressJob*)arg;
6014 try {
6015
6016 // work until done
6017 while (true) {
6018
6019 // wait for something to write
6020 CJ& cj=job.q[job.front]; // no other threads move front
6021 cj.compressed.wait();
6022
6023 // Quit if end of input
6024 lock(job.mutex);
6025 if (cj.method=="") {
6026 release(job.mutex);
6027 return 0;
6028 }
6029
6030 // Write to archive
6031 assert(cj.state==CJ::COMPRESSED);
6032 cj.state=CJ::WRITING;
6033 job.csize.push_back(cj.out.size());
6034 int outsize=cj.out.size();
6035 if (outsize>0) {
6036 release(job.mutex);
6037 cj.out.save(job.out);
6038 cj.out.reset();
6039 lock(job.mutex);
6040 }
6041 cj.state=CJ::EMPTY;
6042 job.front=(job.front+1)%job.qsize;
6043 job.empty.signal();
6044 release(job.mutex);
6045 }
6046 }
6047 catch (std::exception& e) {
6048 fprintf(stderr, "zpaq exiting from writeThread: %s\n", e.what());
6049 exit(1);
6050 }
6051 return 0;
6052 }
6053
6054 // Write a ZPAQ compressed JIDAC block header. Output size should not
6055 // depend on input data.
writeJidacHeader(libzpaq::Writer * out,int64_t date,int64_t cdata,unsigned htsize)6056 void writeJidacHeader(libzpaq::Writer *out, int64_t date,
6057 int64_t cdata, unsigned htsize) {
6058 if (!out) return;
6059 assert(date>=19700000000000LL && date<30000000000000LL);
6060 StringBuffer is;
6061 is+=ltob(cdata);
6062 compressBlock(&is, out, "0",
6063 ("jDC"+itos(date, 14)+"c"+itos(htsize, 10)).c_str());
6064 }
6065
6066 // Maps sha1 -> fragment ID in ht with known size
6067 class HTIndex {
6068 enum {N=1<<22}; // size of hash table t
6069 vector<HT>& htr; // reference to ht
6070 vector<vector<unsigned> > t; // sha1 prefix -> list of indexes
6071 unsigned htsize; // number of IDs in t
6072
6073 // Compuate a hash index for sha1[20]
hash(const unsigned char * sha1)6074 unsigned hash(const unsigned char* sha1) {
6075 return (sha1[0]|(sha1[1]<<8)|(sha1[2]<<16))&(N-1);
6076 }
6077
6078 public:
HTIndex(vector<HT> & r)6079 HTIndex(vector<HT>& r): htr(r), t(N), htsize(0) {
6080 update();
6081 }
6082
6083 // Find sha1 in ht. Return its index or 0 if not found.
find(const char * sha1)6084 unsigned find(const char* sha1) {
6085 vector<unsigned>& v=t[hash((const unsigned char*)sha1)];
6086 for (unsigned i=0; i<v.size(); ++i)
6087 if (memcmp(sha1, htr[v[i]].sha1, 20)==0)
6088 return v[i];
6089 return 0;
6090 }
6091
6092 // Update index of ht. Do not index if fragment size is unknown.
update()6093 void update() {
6094 for (; htsize<htr.size(); ++htsize)
6095 if (htr[htsize].csize!=HT_BAD && htr[htsize].usize>=0)
6096 t[hash(htr[htsize].sha1)].push_back(htsize);
6097 }
6098 };
6099
6100 // Sort by sortkey, then by full path
compareFilename(DTMap::iterator ap,DTMap::iterator bp)6101 bool compareFilename(DTMap::iterator ap, DTMap::iterator bp) {
6102 if (ap->second.sortkey!=bp->second.sortkey)
6103 return ap->second.sortkey<bp->second.sortkey;
6104 return ap->first<bp->first;
6105 }
6106
6107 // For writing to two archives at once
6108 struct WriterPair: public libzpaq::Writer {
6109 libzpaq::Writer *a, *b;
putWriterPair6110 void put(int c) {
6111 if (a) a->put(c);
6112 if (b) b->put(c);
6113 }
writeWriterPair6114 void write(const char* buf, int n) {
6115 if (a) a->write(buf, n);
6116 if (b) b->write(buf, n);
6117 }
WriterPairWriterPair6118 WriterPair(): a(0), b(0) {}
6119 };
6120
6121 // Add or delete files from archive. Return 1 if error else 0.
add()6122 int Jidac::add() {
6123
6124 // Read archive (preferred) or index into ht, dt, ver.
6125 int errors=0;
6126 int64_t header_pos=0; // end of archive
6127 int64_t index_pos=0; // end of index
6128 const string part1=subpart(archive, 1);
6129 const string part0=subpart(archive, 0);
6130 if (exists(part1)) {
6131 if (part0!=part1 && exists(part0)) { // compare archive with index
6132 Jidac jidac(*this);
6133 header_pos=read_archive(&errors);
6134 index_pos=jidac.read_archive(&errors, part0.c_str());
6135 if (index_pos+dhsize!=header_pos || ver.size()!=jidac.ver.size()) {
6136 fprintf(stderr, "Index ");
6137 printUTF8(part0.c_str(), stderr);
6138 fprintf(stderr, " shows %1.0f bytes in %d versions\n"
6139 " but archive has %1.0f bytes in %d versions.\n",
6140 index_pos+dhsize+0.0, size(jidac.ver)-1,
6141 header_pos+0.0, size(ver)-1);
6142 error("index does not match multi-part archive");
6143 }
6144 }
6145 else { // archive with no index
6146 header_pos=read_archive(&errors);
6147 index_pos=header_pos-dhsize;
6148 }
6149 }
6150 else if (exists(part0)) { // read index of remote archive
6151 index_pos=read_archive(&errors, part0.c_str());
6152 if (dcsize!=0) error("index contains data");
6153 dcsize=dhsize; // assumed
6154 header_pos=index_pos+dhsize;
6155 if (quiet<MAX_QUIET) {
6156 printUTF8(part0.c_str(), con);
6157 fprintf(con, ": assuming %1.0f bytes in %d versions\n",
6158 dhsize+index_pos+0.0, size(ver)-1);
6159 }
6160 }
6161
6162 // Set method and block size
6163 if (method=="") { // set default method
6164 if (dhsize>0 && dcsize==0) method="i"; // index
6165 else method="1"; // archive
6166 }
6167 if (size(method)==1) { // set default blocksize
6168 if (method[0]>='2' && method[0]<='9') method+="6";
6169 else method+="4";
6170 }
6171 if (command=="-add" && quiet<MAX_QUIET)
6172 fprintf(con, "Compressing with -method %s\n", method.c_str());
6173 if (strchr("0123456789xsi", method[0])==0)
6174 error("-method must begin with 0..5, x, s, or i");
6175 assert(size(method)>=2);
6176 unsigned blocksize=(1u<<(20+atoi(method.c_str()+1)))-4096;
6177 if (fragment<0 || fragment>19 || (1u<<(12+fragment))>blocksize)
6178 error("fragment size too large");
6179 if (command=="-add") { // don't mix archives and indexes
6180 if (method[0]=='i' && dcsize>0) error("archive is not an index");
6181 if (method[0]!='i' && dcsize!=dhsize) error("archive is an index");
6182 }
6183
6184 // Make list of files to add or delete
6185 read_args(command!="-delete");
6186
6187 // Sort the files to be added by filename extension and decreasing size
6188 vector<DTMap::iterator> vf;
6189 unsigned deletions=0;
6190 total_size=0;
6191 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
6192 if (p->second.edate && (force || p->second.dtv.size()==0
6193 || p->second.edate!=p->second.dtv.back().date
6194 || p->second.esize!=p->second.dtv.back().size)) {
6195 total_size+=p->second.esize;
6196
6197 // Key by first 5 bytes of filename extension, case insensitive
6198 int sp=0; // sortkey byte position
6199 for (string::const_iterator q=p->first.begin(); q!=p->first.end(); ++q){
6200 uint64_t c=*q&255;
6201 if (c>='A' && c<='Z') c+='a'-'A';
6202 if (c=='/') sp=0, p->second.sortkey=0;
6203 else if (c=='.') sp=8, p->second.sortkey=0;
6204 else if (sp>3) p->second.sortkey+=c<<(--sp*8);
6205 }
6206
6207 // Key by descending size rounded to 16K
6208 int64_t s=p->second.esize>>14;
6209 if (s>=(1<<24)) s=(1<<24)-1;
6210 p->second.sortkey+=(1<<24)-s-1;
6211
6212 vf.push_back(p);
6213 }
6214 if (!nodelete && p->second.written==0 && p->second.edate==0)
6215 ++deletions;
6216 }
6217 std::sort(vf.begin(), vf.end(), compareFilename);
6218
6219 // Test if any files are to be added or deleted
6220 if (vf.size()==0 && deletions==0) {
6221 if (quiet<MAX_QUIET)
6222 fprintf(con, "Archive %s not updated: nothing to add or delete.\n",
6223 archive.c_str());
6224 return errors>0;
6225 }
6226
6227 // Open index to append
6228 WriterPair wp; // wp.a points to output, wp.b to index
6229 Archive index;
6230 if (part0!=part1 && (exists(part0) || !exists(part1))) {
6231 if (method[0]=='s')
6232 error("Cannot update indexed archive in streaming mode");
6233 if (!index.open(part0.c_str(), password, 'w', index_pos))
6234 error("Index open failed");
6235 index_pos=index.tell();
6236 wp.b=&index;
6237 }
6238
6239 // Open archive to append
6240 Archive out;
6241 Counter counter;
6242 if (archive=="")
6243 wp.a=&counter;
6244 else if (part0!=part1 && exists(part0) && !exists(part1)) { // remote
6245 char salt[32]={0};
6246 if (password) { // get salt from index
6247 index.close();
6248 if (index.open(part0.c_str()) && index.read(salt, 32)==32) {
6249 salt[0]^=0x4d;
6250 index.close();
6251 }
6252 else error("cannot read salt from index");
6253 if (!index.open(part0.c_str(), password, 'w'))
6254 error("index reopen failed");
6255 }
6256 string part=subpart(archive, ver.size());
6257 if (quiet<MAX_QUIET) {
6258 fprintf(con, "Creating ");
6259 printUTF8(part.c_str(), con);
6260 fprintf(con, " dated %s assuming %1.0f prior bytes\n",
6261 dateToString(date).c_str(), header_pos+0.0);
6262 }
6263 if (exists(part)) error("output archive part exists");
6264 if (!out.open(part.c_str(), password, 'w', header_pos, header_pos, salt))
6265 error("Archive open failed");
6266 header_pos=out.tell();
6267 wp.a=&out;
6268 }
6269 else {
6270 if (!out.open(archive.c_str(), password, 'w', header_pos))
6271 error("Archive open failed");
6272 header_pos=out.tell();
6273 if (quiet<MAX_QUIET) {
6274 fprintf(con, "%s ", (header_pos>32 ? "Updating" : "Creating"));
6275 printUTF8(archive.c_str(), con);
6276 fprintf(con, " version %d at %s\n",
6277 size(ver), dateToString(date).c_str());
6278 }
6279 wp.a=&out;
6280 }
6281 if (method[0]=='i') { // create index
6282 wp.b=wp.a;
6283 wp.a=0;
6284 }
6285 int64_t inputsize=0; // total input size
6286
6287 // Append in streaming mode. Each file is a separate block. Large files
6288 // are split into blocks of size blocksize.
6289 if (method[0]=='s' && command=="-add") {
6290 StringBuffer sb(blocksize+4096-128);
6291 int64_t offset=archive=="" ? 0 : out.tell();
6292 for (unsigned fi=0; fi<vf.size(); ++fi) {
6293 DTMap::iterator p=vf[fi];
6294 if (p->first.size()>0 && p->first[p->first.size()-1]!='/') {
6295 int64_t start=mtime();
6296 InputFile in;
6297 if (in.open(p->first.c_str())) {
6298 int64_t i=0;
6299 while (true) {
6300 int c=in.get();
6301 if (c!=EOF) ++i, sb.put(c);
6302 if (c==EOF || sb.size()==blocksize) {
6303 string filename="";
6304 string comment=itos(sb.size());
6305 if (i<=blocksize) {
6306 filename=p->first;
6307 comment+=" "+itos(p->second.edate);
6308 if ((p->second.eattr&255)>0) {
6309 comment+=" ";
6310 comment+=char(p->second.eattr&255);
6311 comment+=itos(p->second.eattr>>8);
6312 }
6313 }
6314 compressBlock(&sb, &wp, method, filename.c_str(),
6315 comment.c_str());
6316 assert(sb.size()==0);
6317 }
6318 if (c==EOF) break;
6319 }
6320 in.close();
6321 inputsize+=i;
6322 int64_t newoffset=archive=="" ? counter.pos : out.tell();
6323 if (quiet<=i) {
6324 printUTF8(p->first.c_str(), con);
6325 fprintf(con, " %1.0f -> %1.0f in %1.3f sec.\n", double(i),
6326 double(newoffset-offset), 0.001*(mtime()-start));
6327 }
6328 offset=newoffset;
6329 }
6330 else ++errors;
6331 }
6332 }
6333 if (quiet<MAX_QUIET) {
6334 const int64_t outsize=archive=="" ? counter.pos : out.tell();
6335 fprintf(con, "%1.0f + (%1.0f -> %1.0f) = %1.0f\n",
6336 double(header_pos),
6337 double(inputsize),
6338 double(outsize-header_pos),
6339 double(outsize));
6340 }
6341 if (archive!="") out.close();
6342 return errors>0;
6343 }
6344
6345 // Adjust date to maintain sequential order
6346 if (ver.size() && ver.back().date>=date) {
6347 const int64_t newdate=decimal_time(unix_time(ver.back().date)+1);
6348 fprintf(stderr, "Warning: adjusting date from %s to %s\n",
6349 dateToString(date).c_str(), dateToString(newdate).c_str());
6350 assert(newdate>date);
6351 date=newdate;
6352 }
6353
6354 // Build htinv for fast lookups of sha1 in ht
6355 HTIndex htinv(ht);
6356
6357 // reserve space for the header block
6358 const unsigned htsize=ht.size(); // fragments at start of update
6359 writeJidacHeader(&wp, date, -1, htsize);
6360 const int64_t header_end=archive=="" ? counter.pos : out.tell();
6361
6362 // Start compress and write jobs
6363 vector<ThreadID> tid(threads*2-1);
6364 ThreadID wid;
6365 CompressJob job(threads, tid.size(), wp.a);
6366 if (quiet<MAX_QUIET && deletions>0)
6367 fprintf(con, "Deleting %d files.\n", deletions);
6368 if (quiet<MAX_QUIET && size(vf)>0)
6369 fprintf(con,
6370 "Adding %1.6f MB in %d files using %d jobs in %d threads.\n",
6371 total_size/1000000.0, size(vf), size(tid), threads);
6372 for (int i=0; i<size(tid); ++i) run(tid[i], compressThread, &job);
6373 run(wid, writeThread, &job);
6374
6375 // Compress until end of last file
6376 assert(method!="");
6377 const unsigned MIN_FRAGMENT=64<<fragment; // fragment size limits
6378 const unsigned MAX_FRAGMENT=8128<<fragment;
6379 StringBuffer sb(blocksize+4096-128); // block to compress
6380 unsigned frags=0; // number of fragments in sb
6381 unsigned redundancy=0; // estimated bytes that can be compressed out of sb
6382 unsigned text=0; // number of fragents containing text
6383 unsigned exe=0; // number of fragments containing x86 (exe, dll)
6384 const int ON=4; // number of order-1 tables to save
6385 unsigned char o1prev[ON*256]={0}; // last ON order 1 predictions
6386 libzpaq::Array<char> fragbuf(MAX_FRAGMENT);
6387
6388 // For each file to be added
6389 for (unsigned fi=0; fi<vf.size(); ++fi) {
6390 assert(vf[fi]->second.eptr.size()==0);
6391 DTMap::iterator p=vf[fi];
6392 string filename=rename(p->first);
6393
6394 // Skip directory
6395 if (filename!="" && filename[filename.size()-1]=='/') {
6396 if (quiet<=0) {
6397 fprintf(con, "Adding directory ");
6398 printUTF8(p->first.c_str(), con);
6399 fprintf(con, "\n");
6400 }
6401 continue;
6402 }
6403
6404 // Open input file
6405 InputFile in;
6406 if (!in.open(filename.c_str())) { // skip if not found
6407 p->second.edate=0;
6408 lock(job.mutex);
6409 total_size-=p->second.esize;
6410 release(job.mutex);
6411 ++errors;
6412 continue;
6413 }
6414 else if (quiet<=p->second.esize) {
6415 fprintf(con, "%6u ", (unsigned)ht.size());
6416 if (p->second.dtv.size()==0 || p->second.dtv.back().date==0) {
6417 fprintf(con, "Adding %12.0f ", double(p->second.esize));
6418 printUTF8(p->first.c_str(), con);
6419 }
6420 else {
6421 fprintf(con, "Updating %12.0f ", double(p->second.esize));
6422 printUTF8(p->first.c_str(), con);
6423 }
6424 if (p->first!=filename) {
6425 fprintf(con, " from ");
6426 printUTF8(filename.c_str(), con);
6427 }
6428 fprintf(con, "\n");
6429 }
6430
6431 // Read fragments
6432 assert(in.isopen());
6433 for (unsigned fj=0; true; ++fj) {
6434 int c=0; // current byte
6435 int c1=0; // previous byte
6436 unsigned h=0; // rolling hash for finding fragment boundaries
6437 int64_t sz=0; // fragment size;
6438 libzpaq::SHA1 sha1;
6439 unsigned char o1[256]={0};
6440 unsigned hits=0;
6441 while (true) {
6442 c=in.get();
6443 if (c!=EOF) {
6444 if (c==o1[c1]) h=(h+c+1)*314159265u, ++hits;
6445 else h=(h+c+1)*271828182u;
6446 o1[c1]=c;
6447 c1=c;
6448 sha1.put(c);
6449 fragbuf[sz++]=c;
6450 }
6451 if (c==EOF || (h<(1u<<22>>fragment) && sz>=MIN_FRAGMENT)
6452 || sz>=MAX_FRAGMENT)
6453 break;
6454 }
6455 assert(sz<=MAX_FRAGMENT);
6456 inputsize+=sz;
6457
6458 // Look for matching fragment
6459 char sh[20];
6460 assert(uint64_t(sz)==sha1.usize());
6461 memcpy(sh, sha1.result(), 20);
6462 unsigned htptr=htinv.find(sh);
6463 if (htptr==0) { // not matched
6464
6465 // Analyze fragment for redundancy, x86, text.
6466 // Test for text: letters, digits, '.' and ',' followed by spaces
6467 // and no invalid UTF-8.
6468 // Test for exe: 139 (mov reg, r/m) in lots of contexts.
6469 // 4 tests for redundancy, measured as hits/sz. Take the highest of:
6470 // 1. Successful prediction count in o1.
6471 // 2. Non-uniform distribution in o1 (counted in o2).
6472 // 3. Fraction of zeros in o1 (bytes never seen).
6473 // 4. Fraction of matches between o1 and previous o1 (o1prev).
6474 int text1=0, exe1=0;
6475 int64_t h1=sz;
6476 unsigned char o1ct[256]={0}; // counts of bytes in o1
6477 static const unsigned char dt[256]={ // 32768/((i+1)*204)
6478 160,80,53,40,32,26,22,20,17,16,14,13,12,11,10,10,
6479 9, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
6480 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
6481 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6482 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6483 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6484 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6485 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6486 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6487 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
6488 for (int i=0; i<256; ++i) {
6489 if (o1ct[o1[i]]<255) h1-=(sz*dt[o1ct[o1[i]]++])>>15;
6490 if (o1[i]==' ' && (isalnum(i) || i=='.' || i==',')) ++text1;
6491 if (o1[i] && (i<9 || i==11 || i==12 || (i>=14 && i<=31) || i>=240))
6492 --text1;
6493 if (i>=192 && i<240 && o1[i] && (o1[i]<128 || o1[i]>=192))
6494 --text1;
6495 if (o1[i]==139) ++exe1;
6496 }
6497 text1=(text1>=3);
6498 exe1=(exe1>=5);
6499 if (sz>0) h1=h1*h1/sz; // Test 2: near 0 if random.
6500 unsigned h2=h1;
6501 if (h2>hits) hits=h2;
6502 h2=o1ct[0]*sz/256; // Test 3: bytes never seen or that predict 0.
6503 if (h2>hits) hits=h2;
6504 h2=0;
6505 for (int i=0; i<256*ON; ++i) // Test 4: compare to previous o1.
6506 h2+=o1prev[i]==o1[i&255];
6507 h2=h2*sz/(256*ON);
6508 if (h2>hits) hits=h2;
6509 if (hits>sz) hits=sz;
6510
6511 // Start a new block if the current block is almost full, or at
6512 // the start of a file that won't fit or doesn't share mutual
6513 // information with the current block.
6514 bool newblock=false;
6515 if (frags>0 && fj==0) {
6516 const unsigned newsize=sb.size()+p->second.esize
6517 +(p->second.esize>>(8+fragment))+4096+frags*4; // size if added
6518 if (newsize>blocksize/4 && redundancy<sb.size()/128) newblock=true;
6519 if (newblock) { // test for mutual information
6520 unsigned ct=0;
6521 for (unsigned i=0; i<256*ON; ++i)
6522 if (o1prev[i] && o1prev[i]==o1[i&255]) ++ct;
6523 if (ct>ON*2) newblock=false;
6524 }
6525 if (newsize>=blocksize) newblock=true; // won't fit?
6526 }
6527 if (sb.size()+sz+80+frags*4>=blocksize) newblock=true; // full?
6528 if (frags<1) newblock=false; // block is empty?
6529
6530 // Pad sb with fragment size list unless fragile, then compress
6531 if (newblock) {
6532 assert(sb.size()>0);
6533 assert(frags>0);
6534 assert(frags<ht.size());
6535 for (unsigned i=ht.size()-frags; !fragile && i<ht.size(); ++i)
6536 sb+=itob(ht[i].usize); // list of frag sizes
6537 sb+=itob(0); // omit first frag ID to make block movable
6538 sb+=itob(frags*!fragile); // number of frags
6539 job.write(sb,
6540 ("jDC"+itos(date, 14)+"d"+itos(ht.size()-frags, 10)).c_str(),
6541 method,
6542 redundancy/(sb.size()/256+1)*4+(exe>frags)*2+(text>frags));
6543 assert(sb.size()==0);
6544 ht[ht.size()-frags].csize=-1; // mark block start
6545 frags=redundancy=text=exe=0;
6546 memset(o1prev, 0, sizeof(o1prev));
6547 }
6548
6549 // Append fragbuf to sb and update block statistics
6550 sb.write(&fragbuf[0], sz);
6551 ++frags;
6552 redundancy+=hits;
6553 exe+=exe1*4;
6554 text+=text1*2;
6555 if (sz>=MIN_FRAGMENT) {
6556 memmove(o1prev, o1prev+256, 256*(ON-1));
6557 memcpy(o1prev+256*(ON-1), o1, 256);
6558 }
6559 } // end if not matched
6560
6561 // Point file to this fragment
6562 if (htptr==0) { // not matched in ht
6563 htptr=ht.size();
6564 ht.push_back(HT(sh, sz, 0));
6565 htinv.update();
6566 }
6567 else {
6568 lock(job.mutex);
6569 bytes_processed+=sz;
6570 release(job.mutex);
6571 }
6572 p->second.eptr.push_back(htptr);
6573
6574 if (c==EOF)
6575 break;
6576 } // end for each fragment
6577 in.close();
6578 } // end for each file
6579
6580 // Compress any remaining data
6581 if (frags>0) {
6582 assert(frags<ht.size());
6583 for (unsigned i=ht.size()-frags; !fragile && i<ht.size(); ++i)
6584 sb+=itob(ht[i].usize);
6585 sb+=itob(0);
6586 sb+=itob(frags*!fragile);
6587 job.write(sb,
6588 ("jDC"+itos(date, 14)+"d"+itos(ht.size()-frags, 10)).c_str(),
6589 method,
6590 redundancy/(sb.size()/256+1)*4+(exe>frags)*2+(text>frags));
6591 assert(sb.size()==0);
6592 ht[ht.size()-frags].csize=-1;
6593 }
6594
6595 // Wait for jobs to finish
6596 assert(sb.size()==0);
6597 job.write(sb, 0, ""); // signal end of input
6598 for (int i=0; i<size(tid); ++i)
6599 join(tid[i]);
6600 join(wid);
6601
6602 // Fill in compressed sizes in ht
6603 unsigned j=0;
6604 for (unsigned i=htsize; i<ht.size() && j<job.csize.size(); ++i)
6605 if (ht[i].csize==-1)
6606 ht[i].csize=job.csize[j++];
6607 assert(j==job.csize.size());
6608
6609 // Append compressed fragment tables to archive
6610 if (quiet<MAX_QUIET-1) {
6611 fprintf(con, "Updating index with %d files, %d blocks, %d fragments.\n",
6612 int(vf.size()), j, int(ht.size()-htsize));
6613 }
6614 int64_t cdatasize=(archive=="" ? counter.pos : out.tell())-header_end;
6615 StringBuffer is;
6616 unsigned block_start=0;
6617 for (unsigned i=htsize; i<=ht.size(); ++i) {
6618 if ((i==ht.size() || ht[i].csize>0) && is.size()>0) { // write a block
6619 assert(block_start>=htsize && block_start<i);
6620 compressBlock(&is, &wp, "0",
6621 ("jDC"+itos(date, 14)+"h"+itos(block_start, 10)).c_str());
6622 assert(is.size()==0);
6623 }
6624 if (i<ht.size()) {
6625 if (ht[i].csize) is+=itob(ht[i].csize), block_start=i;
6626 is+=string(ht[i].sha1, ht[i].sha1+20)+itob(ht[i].usize);
6627 }
6628 }
6629 assert(is.size()==0);
6630
6631 // Append compressed index to archive
6632 int dtcount=0;
6633 int archiveResets=0;
6634 for (DTMap::iterator p=dt.begin(); p!=dt.end();) {
6635 const DT& dtr=p->second;
6636
6637 // Remove file if external does not exist and is currently in archive
6638 if (!nodelete && dtr.written==0 && !dtr.edate && dtr.dtv.size()
6639 && dtr.dtv.back().date) {
6640 is+=ltob(0)+p->first+'\0';
6641 if (quiet<=dtr.dtv.back().size) {
6642 fprintf(con, "Removing %12.0f ", dtr.dtv.back().size+0.0);
6643 printUTF8(p->first.c_str(), con);
6644 fprintf(con, "\n");
6645 }
6646 }
6647
6648 // Update file if compressed and anything differs
6649 if (dtr.edate && (force || dtr.dtv.size()==0
6650 || dtr.edate!=dtr.dtv.back().date
6651 || dtr.esize!=dtr.dtv.back().size)) {
6652
6653 // Reset archive bit
6654 #ifndef unix
6655 if (resetArchive && (dtr.eattr&255)=='w'
6656 && ((dtr.eattr>>8)&FILE_ATTRIBUTE_ARCHIVE)) {
6657 p->second.eattr&=~(int64_t(FILE_ATTRIBUTE_ARCHIVE)<<8);
6658 SetFileAttributes(utow(rename(p->first).c_str()).c_str(),
6659 dtr.eattr>>8);
6660 ++archiveResets;
6661 }
6662 #endif
6663
6664 // Append to index if anything changed
6665 if (dtr.dtv.size()==0 // new file
6666 || dtr.edate!=dtr.dtv.back().date // date change
6667 || (dtr.eattr && dtr.dtv.back().attr
6668 && dtr.eattr!=dtr.dtv.back().attr) // attr change
6669 || dtr.esize!=dtr.dtv.back().size // size change
6670 || dtr.eptr!=dtr.dtv.back().ptr) { // content change
6671 is+=ltob(dtr.edate)+p->first+'\0';
6672 if ((dtr.eattr&255)=='u') { // unix attributes
6673 is+=itob(3);
6674 is.put('u');
6675 is.put(dtr.eattr>>8&255);
6676 is.put(dtr.eattr>>16&255);
6677 }
6678 else if ((dtr.eattr&255)=='w') { // windows attributes
6679 is+=itob(5);
6680 is.put('w');
6681 is+=itob(dtr.eattr>>8);
6682 }
6683 else is+=itob(0);
6684 is+=itob(size(dtr.eptr)); // list of frag pointers
6685 for (int i=0; i<size(dtr.eptr); ++i)
6686 is+=itob(dtr.eptr[i]);
6687 }
6688 }
6689 ++p;
6690 if (is.size()>16000 || (is.size()>0 && p==dt.end())) {
6691 compressBlock(&is, &wp, "1",
6692 ("jDC"+itos(date)+"i"+itos(++dtcount, 10)).c_str());
6693 assert(is.size()==0);
6694 }
6695 if (p==dt.end()) break;
6696 }
6697
6698 // Back up and write the header
6699 int64_t archive_end=0;
6700 if (archive=="")
6701 archive_end=counter.pos;
6702 else {
6703 archive_end=out.tell();
6704 out.seek(header_pos, SEEK_SET);
6705 if (wp.b) index.seek(index_pos, SEEK_SET);
6706 writeJidacHeader(wp.a, date, cdatasize, htsize);
6707 if (wp.b) writeJidacHeader(wp.b, date, 0, htsize);
6708 }
6709 if (quiet<MAX_QUIET)
6710 fprintf(con, "\n%1.0f + (%1.0f -> %1.0f) = %1.0f\n",
6711 double(header_pos),
6712 double(inputsize),
6713 double(archive_end-header_pos),
6714 double(archive_end));
6715 if (archiveResets)
6716 fprintf(con, "%d file archive bits reset.\n", archiveResets);
6717 out.close();
6718 index.close();
6719 return errors>0;
6720 }
6721
6722 /////////////////////////////// extract ///////////////////////////////
6723
6724 // Return true if the internal and external file contents are equal
equal(DTMap::const_iterator p)6725 bool Jidac::equal(DTMap::const_iterator p) {
6726 if (p->second.dtv.size()==0 || p->second.dtv.back().date==0)
6727 return p->second.edate==0; // true if neither file exists
6728 if (p->second.edate==0) return false; // external does not exist
6729 assert(p->second.dtv.size()>0);
6730 if (p->second.dtv.back().size!=p->second.esize) return false;
6731 if (p->first!="" && p->first[p->first.size()-1]=='/') return true;
6732 InputFile in;
6733 in.open(rename(p->first).c_str());
6734 if (!in.isopen()) return false;
6735 libzpaq::SHA1 sha1;
6736 for (unsigned i=0; i<p->second.dtv.back().ptr.size(); ++i) {
6737 unsigned f=p->second.dtv.back().ptr[i];
6738 if (f<1 || f>=ht.size() || ht[f].csize==HT_BAD) return false;
6739 for (int j=ht[f].usize; j>0; --j) {
6740 int c=in.get();
6741 if (c==EOF) return false;
6742 sha1.put(c);
6743 }
6744 if (memcmp(sha1.result(), ht[f].sha1, 20)!=0) return false;
6745 }
6746 if (in.get()!=EOF) return false;
6747 return true;
6748 }
6749
6750 // An extract job is a set of blocks with at least one file pointing to them.
6751 // Blocks are extracted in separate threads, set READY -> WORKING.
6752 // A block is extracted to memory up to the last fragment that has a file
6753 // pointing to it. Then the checksums are verified. Then for each file
6754 // pointing to the block, each of the fragments that it points to within
6755 // the block are written in order.
6756
6757 struct Block { // list of fragments
6758 int64_t offset; // location in archive
6759 vector<DTMap::iterator> files; // list of files pointing here
6760 unsigned start; // index in ht of first fragment
6761 int size; // number of fragments to decompress
6762 bool streaming; // must decompress sequentially?
6763 enum {READY, WORKING, GOOD, BAD} state;
BlockBlock6764 Block(unsigned s, int64_t o):
6765 offset(o), start(s), size(0), streaming(false), state(READY) {}
6766 };
6767
6768 struct ExtractJob { // list of jobs
6769 Mutex mutex; // protects state
6770 Mutex write_mutex; // protects writing to disk
6771 int job; // number of jobs started
6772 int next; // next block to extract (usually)
6773 vector<Block> block; // list of blocks to extract
6774 Jidac& jd; // what to extract
6775 OutputFile outf; // currently open output file
6776 DTMap::iterator lastdt; // currently open output file name
6777 double maxMemory; // largest memory used by any block (test mode)
ExtractJobExtractJob6778 ExtractJob(Jidac& j):
6779 job(0), next(0), jd(j), lastdt(j.dt.end()), maxMemory(0) {
6780 init_mutex(mutex);
6781 init_mutex(write_mutex);
6782 }
~ExtractJobExtractJob6783 ~ExtractJob() {
6784 destroy_mutex(mutex);
6785 destroy_mutex(write_mutex);
6786 }
6787 };
6788
6789 // Decompress blocks in a job until none are READY
decompressThread(void * arg)6790 ThreadReturn decompressThread(void* arg) {
6791 ExtractJob& job=*(ExtractJob*)arg;
6792 int jobNumber=0;
6793 Archive in;
6794 const bool istest=job.jd.command=="-test"; // do not open or write files?
6795
6796 // Get job number
6797 lock(job.mutex);
6798 jobNumber=++job.job;
6799 release(job.mutex);
6800
6801 // Open archive for reading
6802 if (!in.open(job.jd.archive.c_str(), job.jd.password)) return 0;
6803 WriteBuffer out;
6804
6805 // Look for next READY job
6806 while (true) {
6807 lock(job.mutex);
6808 unsigned i, k=0;
6809 for (i=0; i<job.block.size(); ++i) {
6810 k=i+job.next;
6811 if (k>=job.block.size()) k-=job.block.size();
6812 assert(k<job.block.size());
6813 Block& b=job.block[k];
6814 if (b.state==Block::READY && b.size>0 && !b.streaming) {
6815 b.state=Block::WORKING;
6816 break;
6817 }
6818 }
6819 if (i<job.block.size()) job.next=k;
6820 release(job.mutex);
6821 if (i>=job.block.size()) break;
6822 Block& b=job.block[k];
6823
6824 // Get uncompressed size of block
6825 unsigned output_size=0; // minimum size to decompress
6826 unsigned max_size=0; // uncompressed full block size
6827 assert(b.start>0);
6828 int j;
6829 for (j=0; j<b.size; ++j) {
6830 assert(b.start+j<job.jd.ht.size());
6831 assert(job.jd.ht[b.start+j].usize>=0);
6832 assert(j==0 || job.jd.ht[b.start+j].csize==-j);
6833 output_size+=job.jd.ht[b.start+j].usize;
6834 }
6835 max_size=output_size+j*4+8; // uncompressed full block size
6836 for (; b.start+j<job.jd.ht.size() && job.jd.ht[b.start+j].csize<0
6837 && job.jd.ht[b.start+j].csize!=HT_BAD; ++j) {
6838 assert(job.jd.ht[b.start+j].csize==-j);
6839 max_size+=job.jd.ht[b.start+j].usize+4;
6840 }
6841
6842 // Decompress
6843 double mem=0; // how much memory used to decompress
6844 try {
6845 assert(b.start>0);
6846 assert(b.start<job.jd.ht.size());
6847 assert(b.size>0);
6848 assert(b.start+b.size<=job.jd.ht.size());
6849 const int64_t now=mtime();
6850 in.seek(job.jd.ht[b.start].csize, SEEK_SET);
6851 libzpaq::Decompresser d;
6852 d.setInput(&in);
6853 out.reset();
6854 out.setLimit(max_size);
6855 d.setOutput(&out);
6856 libzpaq::SHA1 sha1;
6857 if (job.jd.all) d.setSHA1(&sha1);
6858 if (!d.findBlock(&mem)) error("archive block not found");
6859 if (mem>job.maxMemory) job.maxMemory=mem;
6860 while (d.findFilename()) {
6861 StringWriter comment;
6862 d.readComment(&comment);
6863 if (!job.jd.all && comment.s.size()>=5
6864 && comment.s.substr(comment.s.size()-5)==" jDC\x01") {
6865 while (out.size()<output_size && d.decompress(1<<14));
6866 break;
6867 }
6868 else {
6869 char s[21];
6870 d.decompress();
6871 d.readSegmentEnd(s);
6872 if (job.jd.all && s[0]==1 && memcmp(s+1, sha1.result(), 20))
6873 error("checksum error");
6874 }
6875 }
6876 if (out.size()<output_size)
6877 error("unexpected end of compressed data");
6878 if (quiet<MAX_QUIET-1) {
6879 fprintf(con, "Job %d: [%d..%d] %1.0f -> %d (%1.3f s, %1.3f MB)\n",
6880 jobNumber, b.start, b.start+b.size-1,
6881 double(in.tell()-job.jd.ht[b.start].csize),
6882 size(out), (mtime()-now)*0.001, mem/1000000);
6883 }
6884
6885 // Verify fragment checksums if present
6886 int64_t q=0; // fragment start
6887 for (unsigned j=b.start; j<b.start+b.size; ++j) {
6888 if (!fragile) {
6889 char sha1result[20];
6890 out.sha1(sha1result, q, job.jd.ht[j].usize);
6891 q+=job.jd.ht[j].usize;
6892 if (memcmp(sha1result, job.jd.ht[j].sha1, 20)) {
6893 for (int k=0; k<20; ++k) {
6894 if (job.jd.ht[j].sha1[k]) { // all zeros is OK
6895 lock(job.mutex);
6896 fprintf(stderr,
6897 "Job %d: fragment %d size %d checksum failed\n",
6898 jobNumber, j, job.jd.ht[j].usize);
6899 release(job.mutex);
6900 error("bad checksum");
6901 }
6902 }
6903 }
6904 }
6905 lock(job.mutex);
6906 job.jd.ht[j].csize=EXTRACTED;
6907 release(job.mutex);
6908 }
6909 }
6910
6911 // If out of memory, let another thread try
6912 catch (std::bad_alloc& e) {
6913 lock(job.mutex);
6914 fprintf(stderr, "Job %d killed to save memory\n", jobNumber);
6915 b.state=Block::READY;
6916 release(job.mutex);
6917 in.close();
6918 return 0;
6919 }
6920
6921 // Other errors: assume bad input
6922 catch (std::exception& e) {
6923 lock(job.mutex);
6924 fprintf(stderr, "Job %d: skipping frags %u-%u at offset %1.0f: %s\n",
6925 jobNumber, b.start, b.start+b.size-1,
6926 double(in.tell()), e.what());
6927 release(job.mutex);
6928 continue;
6929 }
6930
6931 // Write the files in dt that point to this block
6932 lock(job.write_mutex);
6933 for (unsigned ip=0; ip<b.files.size(); ++ip) {
6934 DTMap::iterator p=b.files[ip];
6935 DT& dtr=p->second;
6936 if (dtr.written<0 || size(dtr.dtv)==0
6937 || dtr.written>=size(dtr.dtv.back().ptr))
6938 continue; // don't write
6939
6940 // Look for pointers to this block
6941 const vector<unsigned>& ptr=dtr.dtv.back().ptr;
6942 string filename="";
6943 int64_t offset=0; // write offset
6944 for (unsigned j=0; j<ptr.size(); ++j) {
6945 if (ptr[j]<b.start || ptr[j]>=b.start+b.size) {
6946 offset+=job.jd.ht[ptr[j]].usize;
6947 continue;
6948 }
6949
6950 // Close last opened file if different
6951 if (p!=job.lastdt) {
6952 if (job.outf.isopen()) {
6953 assert(job.lastdt!=job.jd.dt.end());
6954 assert(job.lastdt->second.dtv.size()>0);
6955 assert(job.lastdt->second.dtv.back().date);
6956 assert(job.lastdt->second.written
6957 <size(job.lastdt->second.dtv.back().ptr));
6958 job.outf.close();
6959 }
6960 job.lastdt=job.jd.dt.end();
6961 }
6962
6963 // Open file for output
6964 if (job.lastdt==job.jd.dt.end()) {
6965 filename=job.jd.rename(p->first);
6966 assert(!job.outf.isopen());
6967 if (dtr.written==0) {
6968 if (!istest) makepath(filename);
6969 if (quiet<=dtr.dtv.back().size) {
6970 fprintf(con, "Job %d: %s %1.0f ", jobNumber,
6971 (istest ? "testing" : "extracting"),
6972 p->second.dtv.back().size+0.0);
6973 printUTF8(filename.c_str(), con);
6974 fprintf(con, "\n");
6975 }
6976 if (!istest && job.outf.open(filename.c_str())) // new file
6977 job.outf.truncate();
6978 }
6979 else if (!istest)
6980 job.outf.open(filename.c_str()); // update existing file
6981 if (!istest && !job.outf.isopen()) break; // skip file if error
6982 job.lastdt=p;
6983 assert(istest || job.outf.isopen());
6984 }
6985 assert(job.lastdt==p);
6986
6987 // Find block offset of fragment
6988 int64_t q=0; // fragment offset from start of block
6989 for (unsigned k=b.start; k<ptr[j]; ++k)
6990 q+=job.jd.ht[k].usize;
6991 assert(q>=0);
6992 assert(q<=out.size()-job.jd.ht[ptr[j]].usize);
6993
6994 // Write the fragment and any consecutive fragments that follow
6995 assert(offset>=0);
6996 ++dtr.written;
6997 int usize=job.jd.ht[ptr[j]].usize;
6998 while (j+1<ptr.size() && ptr[j+1]==ptr[j]+1
6999 && ptr[j+1]<b.start+b.size) {
7000 ++dtr.written;
7001 assert(dtr.written<=size(ptr));
7002 usize+=job.jd.ht[ptr[++j]].usize;
7003 }
7004 assert(q+usize<=out.size());
7005 if (!istest) out.save(job.outf, offset, q, usize);
7006 offset+=usize;
7007 bytes_processed+=usize;
7008 if (dtr.written==size(ptr)) { // close file
7009 assert(dtr.dtv.size()>0);
7010 assert(dtr.dtv.back().date);
7011 assert(job.lastdt!=job.jd.dt.end());
7012 assert(istest || job.outf.isopen());
7013 if (!istest) {
7014 job.outf.truncate(dtr.dtv.back().size);
7015 job.outf.close(dtr.dtv.back().date, dtr.dtv.back().attr);
7016 }
7017 job.lastdt=job.jd.dt.end();
7018 }
7019 } // end for j
7020 } // end for ip
7021
7022 // Last file
7023 release(job.write_mutex);
7024
7025 // Update display
7026 if (quiet<MAX_QUIET) {
7027 lock(job.mutex);
7028 if (bytes_processed>0) {
7029 int64_t eta=(mtime()-global_start+0.0)
7030 *(total_size-bytes_processed)/(bytes_processed+0.5)/1000.0;
7031 if (bytes_processed>0)
7032 fprintf(con, "%d:%02d:%02d to go: ",
7033 int(eta/3600), int(eta/60%60), int(eta%60)); }
7034 if (quiet<=MAX_QUIET-1) {
7035 fprintf(con, "%1.6f MB (%5.2f%%) %c", bytes_processed/1000000.0,
7036 (bytes_processed+0.5)*100.0/(total_size+0.5),
7037 quiet==MAX_QUIET-1 ? '\r' : '\n');
7038 fflush(con);
7039 }
7040 release(job.mutex);
7041 }
7042 } // end while true
7043
7044 // Last block
7045 in.close();
7046 return 0;
7047 }
7048
7049 // Extract files from archive. If force is true then overwrite
7050 // existing files and set the dates and attributes of exising directories.
7051 // Otherwise create only new files and directories. If command is "-test"
7052 // then don't write files. Return 1 if error else 0.
extract()7053 int Jidac::extract() {
7054 const bool istest=command=="-test";
7055
7056 // Read HT, DT
7057 if (!read_archive())
7058 return 1;
7059
7060 // If force is true then as an optimization, compare marked files by
7061 // content and files and directories by dates and attributes.
7062 // If they are exactly the same then unmark them.
7063 // If they are the same except for dates and attributes then reset them.
7064 read_args(force && !istest);
7065 int count=0;
7066 if (force && !istest) {
7067 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7068 if (p->second.written==0) {
7069 string fn=rename(p->first);
7070 if (equal(p) && p->second.dtv.size()>0 && p->second.edate
7071 && fn!="" && fn[fn.size()-1]!='/') {
7072 if (p->second.dtv.back().date!=p->second.edate
7073 || (p->second.eattr && p->second.dtv.back().attr &&
7074 p->second.eattr!=p->second.dtv.back().attr)) {
7075 if (p->second.esize>=quiet) {
7076 fprintf(con, "Resetting to %s %s: ",
7077 attrToString(p->second.dtv.back().attr).c_str(),
7078 dateToString(p->second.dtv.back().date).c_str());
7079 printUTF8(fn.c_str(), con);
7080 fprintf(con, "\n");
7081 }
7082 OutputFile out;
7083 out.open(fn.c_str());
7084 if (out.isopen()) {
7085 out.close(p->second.dtv.back().date,
7086 p->second.dtv.back().attr);
7087 p->second.written=-1; // unmark if date and attr change OK
7088 ++count;
7089 }
7090 }
7091 else { // dates and attributes equal
7092 if (p->second.esize>=quiet) {
7093 fprintf(con, "Identical: %12.0f ", p->second.esize+0.0);
7094 printUTF8(fn.c_str(), con);
7095 fprintf(con, "\n");
7096 }
7097 p->second.written=-1; // unmark if date and attr matches
7098 ++count;
7099 }
7100 }
7101 }
7102 }
7103 if (quiet<MAX_QUIET && count>0)
7104 fprintf(con, "%d identical files skipped\n", count);
7105 }
7106
7107 // If not force then unmark existing files and directories
7108 else if (!istest) {
7109 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7110 if (p->second.dtv.size() && p->second.dtv.back().date
7111 && p->second.written==0) {
7112 if (exists(rename(p->first))) {
7113 if (quiet<p->second.dtv.back().size) {
7114 fprintf(con, "Skipping existing file: %12.0f ",
7115 p->second.dtv.back().size+0.0);
7116 printUTF8(rename(p->first).c_str(), con);
7117 fprintf(con, "\n");
7118 }
7119 p->second.written=-1;
7120 ++count;
7121 }
7122 }
7123 }
7124 if (quiet<MAX_QUIET && count>0)
7125 fprintf(con, "%d existing files skipped.\n", count);
7126 }
7127
7128 // Map fragments to blocks.
7129 // Mark blocks with unknown or large fragment sizes as streaming.
7130 // If test -all, mark all fragments for extraction.
7131 ExtractJob job(*this);
7132 vector<unsigned> hti(ht.size()); // fragment index -> block index
7133 for (unsigned i=1; i<ht.size(); ++i) {
7134 if (ht[i].csize!=HT_BAD) {
7135 if (ht[i].csize>=0)
7136 job.block.push_back(Block(i, ht[i].csize));
7137 assert(job.block.size()>0);
7138 if (istest && all) ++job.block.back().size;
7139 hti[i]=job.block.size()-1;
7140 if (ht[i].usize<0 || ht[i].usize>(1<<30))
7141 job.block.back().streaming=true;
7142 }
7143 }
7144
7145 // Make a list of files and the number of fragments to extract
7146 // from each block. If the file size is unknown, then mark
7147 // all blocks that it points to as streaming.
7148
7149 total_size=0; // total bytes to be extracted
7150 bytes_processed=0; // total bytes extracted so far
7151 int total_files=0;
7152 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7153 if (p->second.dtv.size() && p->second.dtv.back().date
7154 && p->second.written==0) {
7155 assert(p->second.dtv.size()>0);
7156 for (unsigned i=0; i<p->second.dtv.back().ptr.size(); ++i) {
7157 unsigned j=p->second.dtv.back().ptr[i];
7158 if (j==0 || j>=ht.size() || ht[j].csize==HT_BAD) {
7159 printUTF8(p->first.c_str(), stderr);
7160 fprintf(stderr, ": bad frag IDs, skipping...\n");
7161 continue;
7162 }
7163 assert(j>0 && j<ht.size());
7164 assert(ht.size()==hti.size());
7165 int64_t c=-ht[j].csize;
7166 if (c<0) c=0; // position of fragment in block
7167 j=hti[j]; // block index
7168 assert(j>=0 && j<job.block.size());
7169 if (job.block[j].size<=c) job.block[j].size=c+1;
7170 if (job.block[j].files.size()==0 || job.block[j].files.back()!=p)
7171 job.block[j].files.push_back(p);
7172 if (p->second.dtv.back().size<0) job.block[j].streaming=true;
7173 }
7174 total_size+=p->second.dtv.back().size;
7175 if (p->first!="" && p->first[size(p->first)-1]!='/') ++total_files;
7176 }
7177 }
7178
7179 // Decompress archive in parallel
7180 if (quiet<MAX_QUIET)
7181 fprintf(con, "%s %1.6f MB in %d files with %d jobs\n",
7182 (istest ? "Testing" : "Extracting"),
7183 total_size/1000000.0, total_files, threads);
7184 vector<ThreadID> tid(threads);
7185 for (int i=0; i<size(tid); ++i) run(tid[i], decompressThread, &job);
7186
7187 // Decompress streaming files in a single thread
7188 Archive in;
7189 if (!in.open(archive.c_str(), password)) return 1;
7190 OutputFile out;
7191 DTMap::iterator p=dt.end(); // currently open output file (initially none)
7192 string lastfile=archive; // default output file: drop .zpaq from archive
7193 if (lastfile.size()>5 && lastfile.substr(lastfile.size()-5)==".zpaq")
7194 lastfile=lastfile.substr(0, lastfile.size()-5);
7195 bool first=true;
7196 for (unsigned i=0; i<job.block.size(); ++i) {
7197 Block& b=job.block[i];
7198 if (b.size==0 || !b.streaming) continue;
7199 if (quiet<MAX_QUIET-1)
7200 fprintf(con, "main: [%d..%d] block %d\n", b.start, b.start+b.size-1,
7201 i+1);
7202 try {
7203 libzpaq::Decompresser d;
7204 libzpaq::SHA1 sha1;
7205 d.setInput(&in);
7206 d.setSHA1(&sha1);
7207 if (out.isopen()) d.setOutput(&out);
7208 else d.setOutput(0);
7209 in.seek(b.offset, SEEK_SET);
7210 if (!d.findBlock()) error("findBlock failed");
7211 StringWriter filename;
7212
7213 // decompress segments
7214 for (int j=0; d.findFilename(&filename); ++j) {
7215 d.readComment();
7216
7217 // Named segment starts new file
7218 if (filename.s.size()>0 || first) {
7219 for (unsigned i=0; i<filename.s.size(); ++i)
7220 if (filename.s[i]=='\\') filename.s[i]='/';
7221 if (filename.s.size()>0) lastfile=filename.s;
7222 if (out.isopen()) {
7223 out.close();
7224 p=dt.end();
7225 }
7226 first=false;
7227 string newfile;
7228 p=dt.find(lastfile);
7229 if (p!=dt.end() && p->second.written==0) { // todo
7230 newfile=rename(lastfile);
7231 if (!istest) makepath(newfile);
7232 if (istest || out.open(newfile.c_str())) {
7233 if (quiet<MAX_QUIET-1) {
7234 if (istest) fprintf(con, "main: testing ");
7235 else fprintf(con, "main: extracting ");
7236 printUTF8(newfile.c_str(), con);
7237 fprintf(con, "\n");
7238 }
7239 if (!istest) out.truncate(0);
7240 }
7241 if (out.isopen()) d.setOutput(&out);
7242 else {
7243 d.setOutput(0);
7244 if (!istest) p=dt.end();
7245 }
7246 }
7247 }
7248 filename.s="";
7249
7250 // Decompress, verify checksum
7251 if (j<b.size) {
7252 d.decompress();
7253 char sha1out[21];
7254 d.readSegmentEnd(sha1out);
7255 if (!fragile && sha1out[0] && memcmp(sha1out+1, sha1.result(), 20))
7256 error("checksum error");
7257 else {
7258 assert(b.start+j<ht.size());
7259 lock(job.mutex);
7260 ht[b.start+j].csize=EXTRACTED;
7261 release(job.mutex);
7262 if (p!=dt.end()) ++p->second.written;
7263 }
7264 }
7265 else
7266 break;
7267 }
7268 }
7269 catch (std::exception& e) {
7270 fprintf(stderr, "main: skipping frags %u-%u at offset %1.0f: %s\n",
7271 b.start, b.start+b.size-1, double(in.tell()), e.what());
7272 continue;
7273 }
7274 }
7275
7276 // Wait for threads to finish
7277 for (int i=0; i<size(tid); ++i) join(tid[i]);
7278
7279 // Create empty directories and set directory dates and attributes
7280 if (!istest) {
7281 for (DTMap::reverse_iterator p=dt.rbegin(); p!=dt.rend(); ++p) {
7282 if (p->second.written==0 && p->first!=""
7283 && p->first[p->first.size()-1]=='/') {
7284 string s=rename(p->first);
7285 if (p->second.dtv.size())
7286 makepath(s, p->second.dtv.back().date, p->second.dtv.back().attr);
7287 }
7288 }
7289 }
7290
7291 // Report failed extractions
7292 unsigned extracted=0, errors=0;
7293 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7294 string fn=rename(p->first);
7295 if (p->second.dtv.size() && p->second.dtv.back().date
7296 && p->second.written>=0 && fn!="" && fn[fn.size()-1]!='/') {
7297 DTV& dtv=p->second.dtv.back();
7298 ++extracted;
7299 unsigned f=0; // fragments extracted OK
7300 for (unsigned j=0; j<dtv.ptr.size(); ++j) {
7301 const unsigned k=dtv.ptr[j];
7302 if (k>0 && k<ht.size() && ht[k].csize==EXTRACTED) ++f;
7303 }
7304 if (f!=dtv.ptr.size() || f!=unsigned(p->second.written)) {
7305 if (++errors==1)
7306 fprintf(stderr,
7307 "\nFailed (extracted,written/total fragments, version, file):\n");
7308 fprintf(stderr, "%u,%u/%u %d ",
7309 f, p->second.written, int(dtv.ptr.size()), dtv.version);
7310 printUTF8(fn.c_str(), stderr);
7311 fprintf(stderr, "\n");
7312 }
7313 }
7314 }
7315 if (quiet<MAX_QUIET || errors>0) {
7316 fprintf((errors>0 ? stderr : con),
7317 "\n%s %u of %u files OK (%u errors) using %1.3f MB x %d threads\n",
7318 (istest ? "Tested" : "Extracted"),
7319 extracted-errors, extracted, errors, job.maxMemory/1000000,
7320 size(tid));
7321 }
7322 return errors>0;
7323 }
7324
7325 /////////////////////////////// list //////////////////////////////////
7326
7327 // For counting files and sizes by -list -summary
7328 struct TOP {
7329 double csize; // compressed size
7330 int64_t size; // uncompressed size
7331 int count; // number of files
TOPTOP7332 TOP(): csize(0), size(0), count(0) {}
incTOP7333 void inc(int64_t n) {size+=n; ++count;}
incTOP7334 void inc(DTMap::const_iterator p) {
7335 if (p->second.dtv.size()>0) {
7336 size+=p->second.dtv.back().size;
7337 csize+=p->second.dtv.back().csize;
7338 ++count;
7339 }
7340 }
7341 };
7342
list_versions(int64_t csize)7343 void Jidac::list_versions(int64_t csize) {
7344 fprintf(con, "\n"
7345 "Ver Last frag Date Time (UT) Files Deleted"
7346 " Original MB Compressed MB\n"
7347 "---- -------- ---------- -------- ------ ------ "
7348 "-------------- --------------\n");
7349 if (since<0) since+=size(ver);
7350 for (int i=since; i<size(ver); ++i) {
7351 int64_t osize=((i<size(ver)-1 ? ver[i+1].offset : csize)-ver[i].offset);
7352 if (i==0 && ver[i].updates==0
7353 && ver[i].deletes==0 && ver[i].date==0 && ver[i].usize==0)
7354 continue;
7355 fprintf(con, "%4d %8d %s %6d %6d %14.6f %14.6f\n", i,
7356 i<size(ver)-1 ? ver[i+1].firstFragment-1 : size(ht)-1,
7357 dateToString(ver[i].date).c_str(),
7358 ver[i].updates, ver[i].deletes, ver[i].usize/1000000.0,
7359 osize/1000000.0);
7360 }
7361 }
7362
7363 // Return p<q for sorting files by decreasing size, then fragment ID list
compareFragmentList(DTMap::const_iterator p,DTMap::const_iterator q)7364 bool compareFragmentList(DTMap::const_iterator p, DTMap::const_iterator q) {
7365 if (q->second.dtv.size()==0) return false;
7366 if (p->second.dtv.size()==0) return true;
7367 int64_t d=p->second.dtv.back().size-q->second.dtv.back().size;
7368 if (d!=0) return d>0;
7369 if (p->second.dtv.back().ptr<q->second.dtv.back().ptr) return true;
7370 if (q->second.dtv.back().ptr<p->second.dtv.back().ptr) return false;
7371 return p->first<q->first;
7372 }
7373
7374 // List contents
list()7375 void Jidac::list() {
7376
7377 // Read archive, which may be "" for empty.
7378 int64_t csize=0;
7379 if (archive!="") {
7380 csize=read_archive();
7381 if (csize==0) exit(1);
7382 }
7383
7384 // Summary. Show only the largest files and directories, sorted by size,
7385 // and block and fragment usage statistics.
7386 if (summary) {
7387 read_args(false);
7388 if (quiet>=MAX_QUIET) return;
7389
7390 // Report biggest files, directories, and extensions
7391 fprintf(con,
7392 "\nRank Size (MB) Ratio Files File, Directory/, or .Type\n"
7393 "---- -------------- ------ --------- --------------------------\n");
7394 map<string, TOP> top; // filename or dir -> total size and count
7395 vector<int> frag(ht.size()); // frag ID -> reference count
7396 int unknown_ref=0; // count fragments and references with unknown size
7397 int unknown_size=0;
7398 for (DTMap::const_iterator p=dt.begin(); p!=dt.end(); ++p) {
7399 if (p->second.dtv.size() && p->second.dtv.back().date
7400 && p->second.written==0) {
7401 top[""].inc(p);
7402 top[p->first].inc(p);
7403 int ext=0; // location of . in filename
7404 for (unsigned i=0; i<p->first.size(); ++i) {
7405 if (p->first[i]=='/') {
7406 top[p->first.substr(0, i+1)].inc(p);
7407 ext=0;
7408 }
7409 else if (p->first[i]=='.') ext=i;
7410 }
7411 if (ext)
7412 top[lowercase(p->first.substr(ext))].inc(p);
7413 else
7414 top["."].inc(p);
7415 for (unsigned i=0; i<p->second.dtv.back().ptr.size(); ++i) {
7416 const unsigned j=p->second.dtv.back().ptr[i];
7417 if (j<frag.size()) {
7418 ++frag[j];
7419 if (ht[j].usize<0) ++unknown_ref;
7420 }
7421 }
7422 }
7423 }
7424 map<int64_t, vector<string> > st;
7425 for (map<string, TOP>::const_iterator p=top.begin();
7426 p!=top.end(); ++p)
7427 st[-p->second.size].push_back(p->first);
7428 int i=1;
7429 for (map<int64_t, vector<string> >::const_iterator p=st.begin();
7430 p!=st.end() && i<=summary; ++p) {
7431 for (unsigned j=0; i<=summary && j<p->second.size(); ++i, ++j) {
7432 fprintf(con, "%4d %14.6f %6.4f %9d ", i, (-p->first)/1000000.0,
7433 top[p->second[j].c_str()].csize/max(int64_t(1), -p->first),
7434 top[p->second[j].c_str()].count);
7435 printUTF8(p->second[j].c_str(), con);
7436 fprintf(con, "\n");
7437 }
7438 }
7439
7440 // Report block and fragment usage statistics
7441 fprintf(con, "\nShares Fragments Deduplicated MB Extracted MB\n"
7442 "------ --------- --------------- ---------------\n");
7443 map<unsigned, TOP> fr, frc; // refs -> deduplicated, extracted count, size
7444 if (since<0) since+=size(ver);
7445 if (since<1) since=1;
7446 if (since>=size(ver)) since=size(ver)-1;
7447 for (unsigned i=ver[since].firstFragment; i<frag.size(); ++i) {
7448 assert(i<ht.size());
7449 int j=frag[i];
7450 if (j>10) j=10;
7451 fr[j].inc(ht[i].usize);
7452 fr[-1].inc(ht[i].usize);
7453 frc[j].inc(int64_t(ht[i].usize)*frag[i]);
7454 frc[-1].inc(int64_t(ht[i].usize)*frag[i]);
7455 if (ht[i].usize<0) ++unknown_size;
7456 }
7457 for (map<unsigned, TOP>::const_iterator p=fr.begin(); p!=fr.end(); ++p) {
7458 if (int(p->first)==-1) fprintf(con, " Total ");
7459 else if (p->first==10) fprintf(con, " 10+ ");
7460 else fprintf(con, "%6u ", p->first);
7461 fprintf(con, "%9d %15.6f %15.6f\n", p->second.count,
7462 p->second.size/1000000.0, frc[p->first].size/1000000.0);
7463 }
7464
7465 // Print versions
7466 list_versions(csize);
7467
7468 // Report fragments with unknown size
7469 fprintf(con, "\n%d references to %d of %d fragments have unknown size.\n",
7470 unknown_ref, unknown_size, size(ht)-1);
7471
7472 // Count blocks and used blocks
7473 int blocks=0, used=0, isused=0;
7474 for (unsigned i=1; i<ht.size(); ++i) {
7475 if (ht[i].csize>=0) {
7476 ++blocks;
7477 used+=isused;
7478 isused=0;
7479 }
7480 isused|=frag[i]>0;
7481 }
7482 used+=isused;
7483 const double usize=top[""].size;
7484 fprintf(con, "%d of %d blocks used.\nCompression %1.6f -> %1.6f MB",
7485 used, blocks, usize/1000000.0, csize/1000000.0);
7486 if (usize>0) fprintf(con, " (ratio %1.3f%%)", csize*100.0/usize);
7487 fprintf(con, "\n");
7488 return;
7489 }
7490
7491 // Make list of files to list
7492 read_args(false, all);
7493 vector<DTMap::const_iterator> filelist;
7494 for (DTMap::const_iterator p=dt.begin(); p!=dt.end(); ++p)
7495 if (p->second.written==0)
7496 filelist.push_back(p);
7497 if (duplicates)
7498 sort(filelist.begin(), filelist.end(), compareFragmentList);
7499
7500 // Ordinary list
7501 int64_t usize=0;
7502 unsigned nfiles=0, shown=0;
7503 if (quiet<MAX_QUIET)
7504 fprintf(con, "\n"
7505 " Ver Date Time (UT) %s Size Ratio File\n"
7506 "----- ---------- -------- %s------------ ------ ----\n",
7507 noattributes?"":"Attr ", noattributes?"":"------ ");
7508 for (unsigned fi=0; fi<filelist.size(); ++fi) {
7509 DTMap::const_iterator p=filelist[fi];
7510 for (unsigned i=0; i<p->second.dtv.size(); ++i) {
7511 if (p->second.dtv[i].version>=since && p->second.dtv[i].size>=quiet
7512 && (all || (i+1==p->second.dtv.size() && p->second.dtv[i].date))) {
7513 if (duplicates && fi>0 && filelist[fi-1]->second.dtv.size()
7514 && p->second.dtv[i].ptr==filelist[fi-1]->second.dtv.back().ptr)
7515 fprintf(con, "=");
7516 else
7517 fprintf(con, ">");
7518 fprintf(con, "%4d ", p->second.dtv[i].version);
7519 if (p->second.dtv[i].date) {
7520 ++shown;
7521 usize+=p->second.dtv[i].size;
7522 double ratio=1.0;
7523 if (p->second.dtv[i].size>0)
7524 ratio=p->second.dtv[i].csize/p->second.dtv[i].size;
7525 if (ratio>9.9999) ratio=9.9999;
7526 fprintf(con, "%s %s%12.0f %6.4f ",
7527 dateToString(p->second.dtv[i].date).c_str(),
7528 noattributes ? "" :
7529 (attrToString(p->second.dtv[i].attr)+" ").c_str(),
7530 double(p->second.dtv[i].size), ratio);
7531 }
7532 else {
7533 fprintf(con, "%-40s", "Deleted");
7534 if (!noattributes) fprintf(con, " ");
7535 }
7536 string s=rename(p->first);
7537 printUTF8(p->first.c_str(), con);
7538 if (s!=p->first) {
7539 fprintf(con, " -> ");
7540 printUTF8(s.c_str(), con);
7541 }
7542 fprintf(con, "\n");
7543 }
7544 }
7545 if (p->second.dtv.size() && p->second.dtv.back().date) ++nfiles;
7546 }
7547 if (quiet<MAX_QUIET) {
7548 fprintf(con, "%u of %u files shown. %1.0f -> %1.0f\n",
7549 shown, nfiles, double(usize), double(csize+dhsize-dcsize));
7550 if (dhsize!=dcsize) // index?
7551 fprintf(con, "Note: %1.0f of %1.0f compressed bytes are in archive\n",
7552 dcsize+0.0, dhsize+0.0);
7553 }
7554 if (all) list_versions(csize);
7555 }
7556
7557 /////////////////////////////// compare ///////////////////////////////
7558
7559 // Compare archive with external files or archive and list differences.
7560 // Return 1 if differences are found, else 0.
compare()7561 int Jidac::compare() {
7562 int count=0, differences=0;
7563
7564 // Compare -with another archive
7565 if (with!="") {
7566 if (size(with)<5 || with.substr(size(with)-5)!=".zpaq") with+=".zpaq";
7567 Jidac jidac(*this);
7568 if (archive!="") read_archive();
7569 jidac.read_archive(0, with.c_str());
7570 read_args(false);
7571 jidac.read_args(false);
7572
7573 // test all files in first archive
7574 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7575 if (p->second.written!=0) continue;
7576 if (p->second.dtv.size()<1) continue;
7577 DTV& dp=p->second.dtv.back();
7578 if (dp.date==0) continue;
7579 ++count;
7580 bool isequal=false;
7581 DTMap::iterator q=jidac.dt.find(rename(p->first));
7582 if (q!=jidac.dt.end() && q->second.dtv.size()>0
7583 && q->second.dtv.back().date>0) { // exists in both archives
7584 DTV& dq=q->second.dtv.back();
7585 isequal=(!all || dp.date==dq.date)
7586 && dp.size==dq.size
7587 && (!all || noattributes || dp.attr==dq.attr)
7588 && dp.ptr.size()==dq.ptr.size();
7589 for (unsigned i=0; isequal && i<dp.ptr.size(); ++i) {
7590 if (dp.ptr[i]<1 || dp.ptr[i]>ht.size())
7591 error("bad ptr in first archive");
7592 if (dq.ptr[i]<1 || dq.ptr[i]>jidac.ht.size())
7593 error("bad ptr in second archive");
7594 if (ht[dp.ptr[i]].usize!=jidac.ht[dq.ptr[i]].usize)
7595 isequal=false;
7596 else if (memcmp(ht[dp.ptr[i]].sha1, jidac.ht[dq.ptr[i]].sha1, 20))
7597 isequal=false;
7598 }
7599 if (!isequal && (dp.size>=quiet || dq.size>=quiet)) {
7600 fprintf(con, "< %s %s%12.0f ",
7601 dateToString(dq.date).c_str(),
7602 noattributes ? "" : (attrToString(dq.attr)+" ").c_str(),
7603 double(dq.size));
7604 printUTF8(rename(q->first).c_str(), con);
7605 fprintf(con, "\n");
7606 }
7607 }
7608 if (!isequal) ++differences;
7609 if (!isequal && dp.size>=quiet) {
7610 fprintf(con, "> %s %s%12.0f ",
7611 dateToString(dp.date).c_str(),
7612 noattributes ? "" : (attrToString(dp.attr)+" ").c_str(),
7613 double(dp.size));
7614 printUTF8(p->first.c_str(), con);
7615 fprintf(con, "\n");
7616 }
7617 }
7618
7619 // list files in second archive but not the first
7620 for (DTMap::iterator p=jidac.dt.begin(); p!=jidac.dt.end(); ++p) {
7621 if (p->second.written!=0) continue;
7622 if (p->second.dtv.size()<1) continue;
7623 if (p->second.dtv.back().date==0) continue;
7624 DTMap::iterator q=dt.find(unrename(p->first));
7625 if (q==dt.end() || q->second.dtv.size()==0
7626 || q->second.written!=0 || q->second.dtv.back().date==0) {
7627 ++count;
7628 ++differences;
7629 DTV& dp=p->second.dtv.back();
7630 if (dp.size>=quiet) {
7631 fprintf(con, "< %s %s%12.0f ",
7632 dateToString(dp.date).c_str(),
7633 noattributes ? "" : (attrToString(dp.attr)+" ").c_str(),
7634 double(dp.size));
7635 printUTF8(rename(p->first).c_str(), con);
7636 fprintf(con, "\n");
7637 }
7638 }
7639 }
7640 if (quiet<MAX_QUIET)
7641 fprintf(con, "%d of %d files differ\n", differences, count);
7642 return differences>0;
7643 }
7644
7645 // Compare with external files
7646 if (archive!="") read_archive();
7647 read_args(true);
7648 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7649 if (p->second.written!=0) continue;
7650 bool isequal=p->second.edate!=0 && p->second.dtv.size()>0
7651 && p->second.dtv.back().date!=0; // both files exist?
7652 if (all && isequal)
7653 isequal=p->second.edate==p->second.dtv.back().date; // same date?
7654 if (all && isequal && (p->second.eattr&255) // attributes present,
7655 && (p->second.dtv.back().attr&255) // same OS, and different?
7656 && ((p->second.eattr^p->second.dtv.back().attr)&255)==0)
7657 isequal=p->second.eattr==p->second.dtv.back().attr;
7658 if (isequal)
7659 isequal=equal(p); // same contents
7660 ++count;
7661 if (!isequal) ++differences;
7662 if (!isequal && (p->second.esize>=quiet
7663 || (p->second.dtv.size()>0 && p->second.dtv.back().size>=quiet))) {
7664 if (p->second.dtv.size()>0 && p->second.dtv.back().date) {
7665 fprintf(con, "> %s %s%12.0f ",
7666 dateToString(p->second.dtv.back().date).c_str(),
7667 noattributes ? "" :
7668 (attrToString(p->second.dtv.back().attr)+" ").c_str(),
7669 double(p->second.dtv.back().size));
7670 printUTF8(p->first.c_str(), con);
7671 fprintf(con, "\n");
7672 }
7673 if (p->second.edate) {
7674 fprintf(con, "< %s %s%12.0f ",
7675 dateToString(p->second.edate).c_str(),
7676 noattributes ? "" :
7677 (attrToString(p->second.eattr)+" ").c_str(),
7678 double(p->second.esize));
7679 printUTF8(rename(p->first).c_str(), con);
7680 fprintf(con, "\n");
7681 }
7682 }
7683 }
7684 if (quiet<MAX_QUIET)
7685 fprintf(con, "%d of %d files differ\n", differences, count);
7686 return differences>0;
7687 }
7688
7689 /////////////////////////////// purge /////////////////////////////////
7690
// One entry in the list of archive blocks
struct BL {
  int64_t start;           // archive offset
  int64_t end;             // last byte + 1
  unsigned used;           // number of references
  unsigned firstFragment;
  bool streaming;          // not journaling?

  // New entries have no location or references and count as streaming
  BL() {
    start=-1;
    end=-1;
    used=0;
    firstFragment=0;
    streaming=true;
  }
};
7700
7701 // Find filename in ZPAQ segment header of form "jDC<date>d<num>"
7702 // and substitute date (14 digits) and num (10 digits). Assume that
7703 // s[0..n-1] is the start of a ZPAQ block with or without a tag.
7704 // Return 0 if successful else error code > 0
setFilename(char * s,int n,int64_t date,unsigned num)7705 int setFilename(char* s, int n, int64_t date, unsigned num) {
7706 if (!s) return 1;
7707 if (*s=='7' && n>13) s+=13, n-=13; // skip tag
7708 if (n<7) return 2;
7709 if (s[0]!='z') return 3;
7710 if (s[1]!='P') return 4;
7711 if (s[2]!='Q') return 5;
7712 int hsize=(s[5]&255)+(s[6]&255)*256+7;
7713 s+=hsize, n-=hsize;
7714 if (n<30) return 6;
7715 if (s[0]!=1) return 7;
7716 if (s[1]!='j') return 8;
7717 if (s[2]!='D') return 9;
7718 if (s[3]!='C') return 10;
7719 if (s[29]!=0) return 11;
7720 string sd=itos(date, 14)+s[18]+itos(num, 10);
7721 memcpy(s+4, sd.c_str(), 25);
7722 return 0;
7723 }
7724
7725 // Copy current version only to first tofiles.zpaq.
7726 // If tofiles[0] is "" then check for errors but discard output.
purge()7727 void Jidac::purge() {
7728
7729 // Check -to option
7730 Archive in, out;
7731 Counter counter;
7732 libzpaq::Writer* outp=&out;
7733 if (size(tofiles)!=1) error("Missing: -to new_archive");
7734 string output=tofiles[0];
7735 if (output=="")
7736 outp=&counter;
7737 else if (size(output)<5 || output.substr(output.size()-5)!=".zpaq")
7738 output+=".zpaq";
7739 for (int i=0; i<size(output); ++i)
7740 if (output[i]=='?' || output[i]=='*')
7741 error("Output archive cannot be multi-part");
7742 if (output==archive)
7743 error("Cannot purge to self");
7744 else if (!force && exists(output, 1))
7745 error("Output archive already exists");
7746
7747 // Copy only, possibly with concatenation or a different key
7748 if (all) {
7749 if (!in.open(archive.c_str(), password, 'r'))
7750 error("archive not found");
7751 if (!out.open(output.c_str(), new_password, 'w', 0))
7752 error("cannot create output archive");
7753 int n;
7754 char buf[1<<14];
7755 while ((n=in.read(buf, 1<<14))>0)
7756 out.write(buf, n);
7757 if (quiet<MAX_QUIET) {
7758 printUTF8(archive.c_str(), con);
7759 fprintf(con, " %1.0f -> ", in.tell()+0.0);
7760 printUTF8(output.c_str(), con);
7761 fprintf(con, " %1.0f\n", out.tell()+0.0);
7762 }
7763 out.close();
7764 in.close();
7765 return;
7766 }
7767
7768 // Read archive to purge
7769 int errors=0;
7770 const int64_t archive_size=read_archive(&errors);
7771 if (archive_size==0) return;
7772 if (errors) error("cannot purge archive with errors");
7773
7774 // Make a list of data blocks. Each block ends at the start of the
7775 // next block or at end of archive.
7776 vector<BL> blist(1); // first element unused
7777 for (unsigned i=1; i<ht.size(); ++i) {
7778 if (ht[i].csize>=0 && ht[i].csize!=HT_BAD) {
7779 BL bl;
7780 blist.back().end=bl.start=ht[i].csize;
7781 bl.end=archive_size;
7782 bl.firstFragment=i;
7783 blist.push_back(bl);
7784 }
7785 }
7786
7787 // Chop blocks if a version header or index starts in the middle of it.
7788 // Mark blocks between the header and index as not streaming.
7789 for (unsigned i=1; i<ver.size(); ++i) {
7790 if (ver[i].csize>=0) { // header and index exists?
7791 for (unsigned j=1; j<blist.size(); ++j) {
7792 if (ver[i].offset>blist[j].start && ver[i].offset<blist[j].end)
7793 blist[j].end=ver[i].offset;
7794 if (ver[i].firstFragment>=1 && ver[i].firstFragment<ht.size()
7795 && ht[ver[i].firstFragment].csize>=0) {
7796 int64_t end=ht[ver[i].firstFragment].csize+ver[i].csize;
7797 if (end>blist[j].start && end<blist[j].end)
7798 blist[j].end=end;
7799 if (blist[j].start>ver[i].offset && blist[j].end<=end)
7800 blist[j].streaming=false;
7801 }
7802 }
7803 }
7804 }
7805
7806 // Test that blocks are sorted, have non-negative start and size,
7807 // don't overlap, and are not streaming. Build index bx.
7808 map<int64_t, unsigned> bx; // block start -> block number
7809 for (unsigned i=1; i<blist.size(); ++i) {
7810 if (blist[i].start<0)
7811 error("negative block start");
7812 if (blist[i].end<blist[i].start)
7813 error("negative block size");
7814 if (i>0 && blist[i].start<blist[i-1].end)
7815 error("unsorted block list");
7816 if (blist[i].streaming)
7817 error("cannot purge archive with streaming data");
7818 bx[blist[i].start]=i;
7819 }
7820
7821 // Mark used blocks if referenced by files in current version.
7822 for (DTMap::iterator p=dt.begin(); p!=dt.end(); ++p) {
7823 if (p->second.dtv.size() && p->second.dtv.back().date) {
7824 for (unsigned i=0; i<p->second.dtv.back().ptr.size(); ++i) {
7825 unsigned j=p->second.dtv.back().ptr[i];
7826 if (j==0 || j>=ht.size() || ht[j].csize==HT_BAD)
7827 error("bad fragment pointer");
7828 if (ht[j].csize<0) j+=ht[j].csize; // start of block
7829 if (j<1 || j>=ht.size() || ht[j].csize==HT_BAD)
7830 error("bad fragment offset");
7831 j=bx[ht[j].csize]; // block number
7832 if (j<1 || j>=blist.size()) error("missing block");
7833 ++blist[j].used;
7834 }
7835 }
7836 }
7837
7838 // Pack fragment ids to remove gaps
7839 vector<unsigned> fmap(ht.size()); // old -> new fragment id
7840 for (unsigned i=1, k=1; i<blist.size(); ++i) {
7841 for (unsigned j=blist[i].firstFragment;
7842 j<ht.size() && (i+1>=blist.size() || j<blist[i+1].firstFragment);
7843 ++j) {
7844 if (blist[i].used && ht[j].csize!=HT_BAD)
7845 fmap[j]=k++;
7846 }
7847 }
7848
7849 // Prepare temp header
7850 StringBuffer hdr;
7851 writeJidacHeader(&hdr, date, -1, 1);
7852
7853 // Report space saved
7854 int64_t deleted_bytes=0;
7855 unsigned deleted_blocks=0;
7856 for (unsigned i=1; i<blist.size(); ++i) {
7857 if (!blist[i].used) {
7858 deleted_bytes+=blist[i].end-blist[i].start;
7859 ++deleted_blocks;
7860 }
7861 }
7862 if (quiet<MAX_QUIET)
7863 fprintf(con, "%1.0f bytes in %u blocks will be purged\n",
7864 double(deleted_bytes), deleted_blocks);
7865
7866 // Open input
7867 if (!in.open(archive.c_str(), password, 'r')) return;
7868
7869 // Test blocks. They should start with "7kS" or "zPQ" and end with 0xff.
7870 for (unsigned i=1; i<blist.size(); ++i) {
7871 in.seek(blist[i].start, SEEK_SET);
7872 int c1=in.get();
7873 int c2=in.get();
7874 int c3=in.get();
7875 if ((c1!='7' || c2!='k' || c3!='S') && (c1!='z' || c2!='P' || c3!='Q'))
7876 error("bad block start");
7877 in.seek(blist[i].end-1, SEEK_SET);
7878 c1=in.get();
7879 if (c1!=255) error("bad block end");
7880 }
7881 if (quiet<MAX_QUIET)
7882 fprintf(con, "%d block locations test OK\n", size(blist)-1);
7883
7884 // Open output.zpaq for output
7885 assert(size(tofiles)==1);
7886 if (output!="") {
7887 if (!out.open(output.c_str(), new_password, 'w', 0))
7888 error("Archive open failed");
7889 }
7890
7891 // Write temporary header
7892 outp->write(hdr.c_str(), hdr.size());
7893
7894 // Copy referenced data blocks
7895 const int N=1<<17;
7896 libzpaq::Array<char> buf(N);
7897 const int64_t cdatastart=out.tell();
7898 for (unsigned i=1; i<blist.size(); ++i) {
7899 if (blist[i].used) {
7900 in.seek(blist[i].start, SEEK_SET);
7901 int n=0;
7902 bool first=true;
7903 for (int64_t j=blist[i].start; j<=blist[i].end; ++j) {
7904 if (n==N || (n>0 && j==blist[i].end)) {
7905 if (first) {
7906 unsigned f=blist[i].firstFragment;
7907 if (f<1 || f>=fmap.size())
7908 error("blist[i].firstFragment out of range");
7909 f=fmap[f];
7910 if (f<1) error("unmapped firstFragment");
7911 if (setFilename(&buf[0],n, date, f))
7912 error("d block filename update failed");
7913 first=false;
7914 }
7915 outp->write(&buf[0], n);
7916 n=0;
7917 }
7918 assert(n<N);
7919 if (j<blist[i].end) {
7920 int c=in.get();
7921 if (c==EOF) error("unexpected EOF");
7922 buf[n++]=c;
7923 }
7924 }
7925 }
7926 }
7927 in.close();
7928 const int64_t cdatasize=out.tell()-cdatastart;
7929
7930 // Write fragment tables
7931 StringBuffer is;
7932 for (unsigned i=1; i<blist.size(); ++i) {
7933 unsigned j=blist[i].firstFragment;
7934 assert(j>0 && j<ht.size() && ht[j].csize!=HT_BAD);
7935 assert(is.size()==0);
7936 if (blist[i].used) {
7937 is+=itob(blist[i].end-blist[i].start);
7938 for (unsigned k=j; k<ht.size() && (k==j || j-ht[k].csize==k); ++k)
7939 is+=string(ht[k].sha1, ht[k].sha1+20)+itob(ht[k].usize);
7940 assert(fmap[j]>0);
7941 compressBlock(&is, outp, "0",
7942 ("jDC"+itos(date, 14)+"h"+itos(fmap[j], 10)).c_str());
7943 }
7944 }
7945
7946 // Append compressed index to archive
7947 int dtcount=0;
7948 assert(is.size()==0);
7949 for (DTMap::const_iterator p=dt.begin(); p!=dt.end();) {
7950 if (p->second.dtv.size()>0 && p->second.dtv.back().date) {
7951 const DTV& dtr=p->second.dtv.back();
7952 is+=ltob(dtr.date)+p->first+'\0';
7953 if ((dtr.attr&255)=='u') { // unix attributes
7954 is+=itob(3);
7955 is.put('u');
7956 is.put(dtr.attr>>8&255);
7957 is.put(dtr.attr>>16&255);
7958 }
7959 else if ((dtr.attr&255)=='w') { // windows attributes
7960 is+=itob(5);
7961 is.put('w');
7962 is+=itob(dtr.attr>>8);
7963 }
7964 else is+=itob(0);
7965 is+=itob(size(dtr.ptr)); // list of frag pointers
7966 for (int i=0; i<size(dtr.ptr); ++i) {
7967 unsigned j=dtr.ptr[i];
7968 if (j<1 || j>=fmap.size()) error("bad unmapped frag pointer");
7969 j=fmap[j];
7970 if (j<1 || j>=fmap.size()) error("bad mapped frag pointer");
7971 is+=itob(j);
7972 }
7973 }
7974 ++p;
7975 if (is.size()>16000 || (is.size()>0 && p==dt.end())) {
7976 compressBlock(&is, outp, "1",
7977 ("jDC"+itos(date)+"i"+itos(++dtcount, 10)).c_str());
7978 assert(is.size()==0);
7979 }
7980 if (p==dt.end()) break;
7981 }
7982
7983 // Complete the update
7984 int64_t new_archive_size=0;
7985 if (outp==&out) {
7986 new_archive_size=out.tell();
7987 out.seek(32*(new_password!=0), SEEK_SET);
7988 writeJidacHeader(&out, date, cdatasize, 1);
7989 if (out.tell()!=size(hdr)+32*(new_password!=0))
7990 error("output header wrong size");
7991 out.close();
7992 }
7993 else
7994 new_archive_size=counter.pos;
7995 if (quiet<MAX_QUIET)
7996 fprintf(con, "%1.0f -> %1.0f\n",
7997 double(archive_size), double(new_archive_size));
7998 }
7999
8000 /////////////////////////////// main //////////////////////////////////
8001
8002 // Convert argv to UTF-8 and replace \ with /
8003 #ifdef unix
main(int argc,const char ** argv)8004 int main(int argc, const char** argv) {
8005 #else
8006 #ifdef _MSC_VER
8007 int wmain(int argc, LPWSTR* argw) {
8008 #else
8009 int main() {
8010 int argc=0;
8011 LPWSTR* argw=CommandLineToArgvW(GetCommandLine(), &argc);
8012 #endif
8013 vector<string> args(argc);
8014 libzpaq::Array<const char*> argp(argc);
8015 for (int i=0; i<argc; ++i) {
8016 args[i]=wtou(argw[i]);
8017 argp[i]=args[i].c_str();
8018 }
8019 const char** argv=&argp[0];
8020 #endif
8021
8022 global_start=mtime(); // get start time
8023 int errorcode=0;
8024 try {
8025 Jidac jidac;
8026 errorcode=jidac.doCommand(argc, argv);
8027 }
8028 catch (std::exception& e) {
8029 fprintf(stderr, "zpaq exiting from main: %s\n", e.what());
8030 errorcode=1;
8031 }
8032 if (quiet<MAX_QUIET) {
8033 fprintf(con, "%1.3f seconds", (mtime()-global_start)/1000.0);
8034 if (errorcode) fprintf(con, " (with errors)\n");
8035 else fprintf(con, " (all OK)\n");
8036 }
8037 return errorcode;
8038 }
8039