1 /* hzip: file compression for sorted dictionaries with optional encryption,
2 * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
3
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #define CODELEN 65536
9 #define BUFSIZE 65536
10 #define EXTENSION ".hz"
11
12 #define ESCAPE 31
13 #define MAGIC "hz0"
14 #define MAGIC_ENCRYPTED "hz1"
15
16 #define DESC "hzip - dictionary compression utility\n" \
17 "Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
18 " -P password encrypted compression\n" \
19 " -h display this help and exit\n"
20
21 enum { code_LEAF, code_TERM, code_NODE};
22
23 struct item {
24 unsigned short word;
25 int count;
26 char type;
27 struct item * left;
28 struct item * right;
29 };
30
fail(const char * err,const char * par)31 int fail(const char * err, const char * par) {
32 fprintf(stderr, err, par);
33 return 1;
34 }
35
code2table(struct item * tree,char ** table,char * code,int deep)36 void code2table(struct item * tree, char **table, char * code, int deep) {
37 int first = 0;
38 if (!code) {
39 first = 1;
40 code = malloc(CODELEN);
41 }
42 code[deep] = '1';
43 if (tree->left) code2table(tree->left, table, code, deep + 1);
44 if (tree->type != code_NODE) {
45 int i = tree->word;
46 code[deep] = '\0';
47 if (tree->type == code_TERM) i = CODELEN; /* terminal code */
48 table[i] = malloc(deep + 1);
49 strcpy(table[i], code);
50 }
51 code[deep] = '0';
52 if (tree->right) code2table(tree->right, table, code, deep + 1);
53 if (first) free(code);
54 }
55
newitem(int c,struct item * l,struct item * r,int t)56 struct item * newitem(int c, struct item * l, struct item * r, int t) {
57 struct item * ni = (struct item *) malloc(sizeof(struct item));
58 ni->type = t;
59 ni->word = 0;
60 ni->count = c;
61 ni->left = l;
62 ni->right = r;
63 return ni;
64 }
65
66 /* return length of the freq array */
get_freqdata(struct item *** dest,FILE * f,unsigned short * termword)67 int get_freqdata(struct item *** dest, FILE * f, unsigned short * termword) {
68 int freq[CODELEN];
69 int i, j, k, n;
70 union {
71 char c[2];
72 unsigned short word;
73 } u;
74 for (i = 0; i < CODELEN; i++) freq[i] = 0;
75 while((j = getc(f)) != -1 && (k = getc(f)) != -1) {
76 u.c[0] = j;
77 u.c[1] = k;
78 freq[u.word]++;
79 }
80 if (j != -1) {
81 u.c[0] = 1;
82 u.c[1] = j;
83 } else {
84 u.c[0] = 0;
85 u.c[1] = 0;
86 }
87
88 *dest = (struct item **) malloc((CODELEN + 1) * sizeof(struct item *));
89 if (!*dest) return -1;
90 for (i = 0, n = 0; i < CODELEN; i++) if (freq[i]) {
91 (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
92 (*dest)[n]->word = i;
93 n++;
94 }
95 /* terminal sequence (also contains the last odd byte of the file) */
96 (*dest)[n] = newitem(1, NULL, NULL, code_TERM);
97 *termword = u.word;
98 return n + 1;
99 }
100
get_codetable(struct item ** l,int n,char ** table)101 void get_codetable(struct item **l, int n, char ** table) {
102 int i;
103 while (n > 1) {
104 int min = 0;
105 int mi2 = 1;
106 for (i = 1; i < n; i++) {
107 if (l[i]->count < l[min]->count) {
108 mi2 = min;
109 min = i;
110 } else if (l[i]->count < l[mi2]->count) mi2 = i;
111 }
112 l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
113 for (i = mi2 + 1; i < n; i++) l[i - 1] = l[i];
114 n--;
115 }
116 code2table(l[0], table, NULL, 0);
117 }
118
write_bits(FILE * f,char * bitbuf,int * bits,char * code)119 int write_bits(FILE *f, char * bitbuf, int *bits, char * code) {
120 while (*code) {
121 int b = (*bits) % 8;
122 if (!b) bitbuf[(*bits) / 8] = ((*code) - '0') << 7;
123 else bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b));
124 (*bits)++;
125 code++;
126 if (*bits == BUFSIZE * 8) {
127 if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f))
128 return 1;
129 *bits = 0;
130 }
131 }
132 return 0;
133 }
134
encode_file(char ** table,int n,FILE * f,FILE * f2,unsigned short tw,char * key)135 int encode_file(char ** table, int n, FILE *f, FILE *f2, unsigned short tw, char * key) {
136 char bitbuf[BUFSIZE];
137 int i, bits = 0;
138 unsigned char cl, ch;
139 int cx[2];
140 union {
141 char c[2];
142 unsigned short word;
143 } u;
144 char * enc = key;
145
146 /* header and codes */
147 fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
148 cl = (unsigned char) (n & 0x00ff);
149 ch = (unsigned char) (n >> 8);
150 if (key) {
151 unsigned char cs;
152 for (cs = 0; *enc; enc++) cs ^= *enc;
153 fprintf(f2, "%c", cs); /* 1-byte check sum */
154 enc = key;
155 ch ^= *enc;
156 if ((*(++enc)) == '\0') enc = key;
157 cl ^= *enc;
158 }
159 fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */
160 for (i = 0; i < BUFSIZE; i++) bitbuf[i] = '\0';
161 for (i = 0; i < CODELEN + 1; i++) if (table[i]) {
162 size_t nmemb;
163 u.word = (unsigned short) i;
164 if (i == CODELEN) u.word = tw;
165 if (key) {
166 if (*(++enc) == '\0') enc = key;
167 u.c[0] ^= *enc;
168 if (*(++enc) == '\0') enc = key;
169 u.c[1] ^= *enc;
170 }
171 fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
172 bits = 0;
173 if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
174 return 1;
175 if (key) {
176 if (*(++enc) == '\0') enc = key;
177 fprintf(f2, "%c", ((unsigned char) bits) ^ *enc);
178 for (cl = 0; cl <= bits/8; cl++) {
179 if (*(++enc) == '\0') enc = key;
180 bitbuf[cl] ^= *enc;
181 }
182 } else
183 fprintf(f2, "%c", (unsigned char) bits); /* 1-byte code length */
184 nmemb = bits/8 + 1;
185 if (fwrite(bitbuf, 1, bits/8 + 1, f2) != nmemb) /* x-byte code */
186 return 1;
187 }
188
189 /* file encoding */
190 bits = 0;
191 while((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
192 u.c[0] = cx[0];
193 u.c[1] = cx[1];
194 if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
195 return 1;
196 }
197 /* terminal suffixes */
198 if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
199 return 1;
200 if (bits > 0)
201 {
202 size_t nmemb = bits/8 + 1;
203 if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
204 return 1;
205 }
206 return 0;
207 }
208
prefixcompress(FILE * f,FILE * tempfile)209 int prefixcompress(FILE *f, FILE *tempfile) {
210 char buf[BUFSIZE];
211 char buf2[BUFSIZE * 2];
212 char prev[BUFSIZE];
213 int prevlen = 0;
214 while(fgets(buf,BUFSIZE,f)) {
215 int i, j, k, m, c=0;
216 int pfx = prevlen;
217 char * p = buf2;
218 m = j = 0;
219 for (i = 0; buf[i]; i++) {
220 if ((pfx > 0) && (buf[i] == prev[i])) {
221 j++;
222 } else pfx = 0;
223 }
224 if (i > 0 && buf[i - 1] == '\n') {
225 if (j == i) j--; /* line duplicate */
226 if (j > 29) j = 29;
227 c = j;
228 if (c == '\t') c = 30;
229 /* common suffix */
230 for (; buf[i - m - 2] == prev[prevlen - m - 2] &&
231 m < i - j - 1 && m < 15; m++);
232 if (m == 1) m = 0;
233 } else {
234 j = 0;
235 m = -1;
236 }
237 for (k = j; k < i - m - 1; k++, p++) {
238 if (((unsigned char) buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
239 *p = ESCAPE;
240 p++;
241 }
242 *p = buf[k];
243 }
244 if (m > 0) {
245 *p = m + 31; /* 33-46 */
246 p++;
247 }
248 if (i > 0 && buf[i - 1] == '\n') {
249 size_t nmemb = p - buf2 + 1;
250 *p = c;
251 if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
252 return 1;
253 } else {
254 size_t nmemb = p - buf2;
255 if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
256 return 1;
257 }
258 memcpy(prev, buf, i);
259 prevlen = i;
260 }
261 return 0;
262 }
263
hzip(const char * filename,char * key)264 int hzip(const char * filename, char * key) {
265 struct item ** list;
266 char * table[CODELEN + 1];
267 int n;
268 char out[BUFSIZE];
269 FILE *f, *f2, *tempfile;
270 unsigned short termword;
271 strcpy(out, filename);
272 strcat(out, EXTENSION);
273 f = fopen(filename, "r");
274 if (!f) return fail("hzip: %s: Permission denied\n", filename);
275 tempfile = tmpfile();
276 if (!tempfile) {
277 fclose(f);
278 return fail("hzip: cannot create temporary file\n", NULL);
279 }
280 f2 = fopen(out, "wb");
281 if (!f2) {
282 fclose(tempfile);
283 fclose(f);
284 return fail("hzip: %s: Permission denied\n", out);
285 }
286 for (n = 0; n < CODELEN; n++) table[n] = NULL;
287 if (prefixcompress(f, tempfile) != 0) {
288 fclose(f2);
289 fclose(tempfile);
290 fclose(f);
291 return fail("hzip: cannot write file\n", NULL);
292 }
293 rewind(tempfile);
294 n = get_freqdata(&list, tempfile, &termword);
295 get_codetable(list, n, table);
296 rewind(tempfile);
297 n = encode_file(table, n, tempfile, f2, termword, key);
298 fclose(f2);
299 fclose(tempfile);
300 fclose(f);
301 if (n != 0) return fail("hzip: cannot write file\n", NULL);
302 return n;
303 }
304
main(int argc,char ** argv)305 int main(int argc, char** argv) {
306
307 int i, j = 0;
308 char * key = NULL;
309 for (i = 1; i < argc; i++) {
310 if (*(argv[i]) == '-') {
311 if (*(argv[i] + 1) == 'h')
312 return fail(DESC, NULL);
313 if (*(argv[i] + 1) == 'P') {
314 if (i + 1 == argc)
315 return fail("hzip: missing password\n", NULL);
316 key = argv[i + 1];
317 i++;
318 continue;
319 }
320 return fail("hzip: no such option: %s\n", argv[i]);
321 } else if (hzip(argv[i], key) != 0) return 1; else j = 1;
322 }
323 if (j == 0) return fail("hzip: need a filename parameter\n", NULL);
324 return 0;
325 }
326