1 /* hzip: file compression for sorted dictionaries with optional encryption,
2  * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
3 
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 
8 #define CODELEN  65536
9 #define BUFSIZE  65536
10 #define EXTENSION ".hz"
11 
12 #define ESCAPE 31
13 #define MAGIC "hz0"
14 #define MAGIC_ENCRYPTED "hz1"
15 
16 #define DESC "hzip - dictionary compression utility\n" \
17 "Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
18 "  -P password  encrypted compression\n" \
19 "  -h           display this help and exit\n"
20 
21 enum { code_LEAF, code_TERM, code_NODE};
22 
23 struct item {
24     unsigned short word;
25     int count;
26     char type;
27     struct item * left;
28     struct item * right;
29 };
30 
fail(const char * err,const char * par)31 int fail(const char * err, const char * par) {
32     fprintf(stderr, err, par);
33     return 1;
34 }
35 
code2table(struct item * tree,char ** table,char * code,int deep)36 void code2table(struct item * tree, char **table, char * code, int deep) {
37     int first = 0;
38     if (!code) {
39         first = 1;
40         code = malloc(CODELEN);
41     }
42     code[deep] = '1';
43     if (tree->left) code2table(tree->left, table, code, deep + 1);
44     if (tree->type != code_NODE) {
45         int i = tree->word;
46         code[deep] = '\0';
47         if (tree->type == code_TERM) i = CODELEN; /* terminal code */
48         table[i] = malloc(deep + 1);
49 	strcpy(table[i], code);
50     }
51     code[deep] = '0';
52     if (tree->right) code2table(tree->right, table, code, deep + 1);
53     if (first) free(code);
54 }
55 
newitem(int c,struct item * l,struct item * r,int t)56 struct item * newitem(int c, struct item * l, struct item * r, int t) {
57     struct item * ni = (struct item *) malloc(sizeof(struct item));
58     ni->type = t;
59     ni->word = 0;
60     ni->count = c;
61     ni->left = l;
62     ni->right = r;
63     return ni;
64 }
65 
66 /* return length of the freq array */
get_freqdata(struct item *** dest,FILE * f,unsigned short * termword)67 int get_freqdata(struct item *** dest, FILE * f, unsigned short * termword) {
68     int freq[CODELEN];
69     int i, j, k, n;
70     union {
71         char c[2];
72         unsigned short word;
73     } u;
74     for (i = 0; i < CODELEN; i++) freq[i] = 0;
75     while((j = getc(f)) != -1 && (k = getc(f)) != -1) {
76         u.c[0] = j;
77         u.c[1] = k;
78         freq[u.word]++;
79     }
80     if (j != -1) {
81         u.c[0] = 1;
82         u.c[1] = j;
83     } else {
84         u.c[0] = 0;
85         u.c[1] = 0;
86     }
87 
88     *dest = (struct item **) malloc((CODELEN + 1) * sizeof(struct item *));
89     if (!*dest) return -1;
90     for (i = 0, n = 0; i < CODELEN; i++) if (freq[i]) {
91        (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
92        (*dest)[n]->word = i;
93        n++;
94     }
95     /* terminal sequence (also contains the last odd byte of the file) */
96     (*dest)[n] = newitem(1, NULL, NULL, code_TERM);
97     *termword = u.word;
98     return n + 1;
99 }
100 
get_codetable(struct item ** l,int n,char ** table)101 void get_codetable(struct item **l, int n, char ** table) {
102     int i;
103     while (n > 1) {
104         int min = 0;
105         int mi2 = 1;
106         for (i = 1; i < n; i++) {
107             if (l[i]->count < l[min]->count) {
108                 mi2 = min;
109                 min = i;
110             } else if (l[i]->count < l[mi2]->count) mi2 = i;
111         }
112         l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
113         for (i = mi2 + 1; i < n; i++) l[i - 1] = l[i];
114         n--;
115     }
116     code2table(l[0], table, NULL, 0);
117 }
118 
write_bits(FILE * f,char * bitbuf,int * bits,char * code)119 int write_bits(FILE *f, char * bitbuf, int *bits, char * code) {
120     while (*code) {
121         int b = (*bits) % 8;
122         if (!b) bitbuf[(*bits) / 8] = ((*code) - '0') << 7;
123         else bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b));
124         (*bits)++;
125         code++;
126         if (*bits == BUFSIZE * 8) {
127             if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f))
128                 return 1;
129             *bits = 0;
130         }
131     }
132     return 0;
133 }
134 
encode_file(char ** table,int n,FILE * f,FILE * f2,unsigned short tw,char * key)135 int encode_file(char ** table, int n, FILE *f, FILE *f2, unsigned short tw, char * key) {
136     char bitbuf[BUFSIZE];
137     int i, bits = 0;
138     unsigned char cl, ch;
139     int cx[2];
140     union {
141         char c[2];
142         unsigned short word;
143     } u;
144     char * enc = key;
145 
146     /* header and codes */
147     fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
148     cl = (unsigned char) (n & 0x00ff);
149     ch = (unsigned char) (n >> 8);
150     if (key) {
151         unsigned char cs;
152         for (cs = 0; *enc; enc++) cs ^= *enc;
153         fprintf(f2, "%c", cs);  /* 1-byte check sum */
154         enc = key;
155         ch ^= *enc;
156         if ((*(++enc)) == '\0') enc = key;
157         cl ^= *enc;
158     }
159     fprintf(f2, "%c%c", ch, cl);   /* upper and lower byte of record count */
160     for (i = 0; i < BUFSIZE; i++) bitbuf[i] = '\0';
161     for (i = 0; i < CODELEN + 1; i++) if (table[i]) {
162         size_t nmemb;
163         u.word = (unsigned short) i;
164         if (i == CODELEN) u.word = tw;
165         if (key) {
166             if (*(++enc) == '\0') enc = key;
167             u.c[0] ^= *enc;
168             if (*(++enc) == '\0') enc = key;
169             u.c[1] ^= *enc;
170         }
171         fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
172         bits = 0;
173         if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
174             return 1;
175         if (key) {
176             if (*(++enc) == '\0') enc = key;
177             fprintf(f2, "%c", ((unsigned char) bits) ^ *enc);
178             for (cl = 0; cl <= bits/8; cl++) {
179                 if (*(++enc) == '\0') enc = key;
180                 bitbuf[cl] ^= *enc;
181             }
182         } else
183             fprintf(f2, "%c", (unsigned char) bits); /* 1-byte code length */
184         nmemb = bits/8 + 1;
185         if (fwrite(bitbuf, 1, bits/8 + 1, f2) != nmemb)   /* x-byte code */
186             return 1;
187     }
188 
189     /* file encoding */
190     bits = 0;
191     while((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
192         u.c[0] = cx[0];
193         u.c[1] = cx[1];
194         if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
195             return 1;
196     }
197     /* terminal suffixes */
198     if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
199         return 1;
200     if (bits > 0)
201     {
202         size_t nmemb = bits/8 + 1;
203         if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
204             return 1;
205     }
206     return 0;
207 }
208 
prefixcompress(FILE * f,FILE * tempfile)209 int prefixcompress(FILE *f, FILE *tempfile) {
210     char buf[BUFSIZE];
211     char buf2[BUFSIZE * 2];
212     char prev[BUFSIZE];
213     int prevlen = 0;
214     while(fgets(buf,BUFSIZE,f)) {
215         int i, j, k, m, c=0;
216         int pfx = prevlen;
217         char * p = buf2;
218         m = j = 0;
219         for (i = 0; buf[i]; i++) {
220             if ((pfx > 0) && (buf[i] == prev[i])) {
221                 j++;
222             } else pfx = 0;
223         }
224         if (i > 0 && buf[i - 1] == '\n') {
225             if (j == i) j--; /* line duplicate */
226             if (j > 29) j = 29;
227             c = j;
228             if (c == '\t') c = 30;
229             /* common suffix */
230             for (; buf[i - m - 2] == prev[prevlen - m - 2] &&
231                 m < i - j - 1 && m < 15; m++);
232             if (m == 1) m = 0;
233         } else {
234             j = 0;
235             m = -1;
236         }
237         for (k = j; k < i - m - 1; k++, p++) {
238             if (((unsigned char) buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
239                 *p = ESCAPE;
240                 p++;
241             }
242             *p = buf[k];
243         }
244         if (m > 0) {
245             *p = m + 31; /* 33-46 */
246             p++;
247         }
248         if (i > 0 && buf[i - 1] == '\n') {
249             size_t nmemb = p - buf2 + 1;
250             *p = c;
251             if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
252                 return 1;
253         } else {
254             size_t nmemb = p - buf2;
255             if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
256                 return 1;
257         }
258         memcpy(prev, buf, i);
259         prevlen = i;
260     }
261     return 0;
262 }
263 
hzip(const char * filename,char * key)264 int hzip(const char * filename, char * key) {
265     struct item ** list;
266     char * table[CODELEN + 1];
267     int n;
268     char out[BUFSIZE];
269     FILE *f, *f2, *tempfile;
270     unsigned short termword;
271     strcpy(out, filename);
272     strcat(out, EXTENSION);
273     f = fopen(filename, "r");
274     if (!f) return fail("hzip: %s: Permission denied\n", filename);
275     tempfile = tmpfile();
276     if (!tempfile) {
277         fclose(f);
278         return fail("hzip: cannot create temporary file\n", NULL);
279     }
280     f2 = fopen(out, "wb");
281     if (!f2) {
282         fclose(tempfile);
283         fclose(f);
284         return fail("hzip: %s: Permission denied\n", out);
285     }
286     for (n = 0; n < CODELEN; n++) table[n] = NULL;
287     if (prefixcompress(f, tempfile) != 0) {
288         fclose(f2);
289         fclose(tempfile);
290         fclose(f);
291         return fail("hzip: cannot write file\n", NULL);
292     }
293     rewind(tempfile);
294     n = get_freqdata(&list, tempfile, &termword);
295     get_codetable(list, n, table);
296     rewind(tempfile);
297     n = encode_file(table, n, tempfile, f2, termword, key);
298     fclose(f2);
299     fclose(tempfile);
300     fclose(f);
301     if (n != 0) return fail("hzip: cannot write file\n", NULL);
302     return n;
303 }
304 
main(int argc,char ** argv)305 int main(int argc, char** argv) {
306 
307     int i, j = 0;
308     char * key = NULL;
309     for (i = 1; i < argc; i++) {
310         if (*(argv[i]) == '-') {
311             if (*(argv[i] + 1) == 'h')
312                 return fail(DESC, NULL);
313             if (*(argv[i] + 1) == 'P') {
314                 if (i + 1 == argc)
315                     return fail("hzip: missing password\n", NULL);
316                 key = argv[i + 1];
317                 i++;
318                 continue;
319             }
320             return fail("hzip: no such option: %s\n", argv[i]);
321         } else if (hzip(argv[i], key) != 0) return 1; else j = 1;
322     }
323     if (j == 0) return fail("hzip: need a filename parameter\n", NULL);
324     return 0;
325 }
326