1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 
38 /* hzip: file compression for sorted dictionaries with optional encryption,
39  * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
40 
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include <string>
46 #include <sys/stat.h>
47 
48 #define CODELEN 65536
49 #define BUFSIZE 65536
50 #define EXTENSION ".hz"
51 
52 #define ESCAPE 31
53 #define MAGIC "hz0"
54 #define MAGIC_ENCRYPTED "hz1"
55 
56 #define DESC                                           \
57   "hzip - dictionary compression utility\n"            \
58   "Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
59   "  -P password  encrypted compression\n"             \
60   "  -h           display this help and exit\n"
61 
62 enum { code_LEAF, code_TERM, code_NODE };
63 
64 struct item {
65   unsigned short word;
66   int count;
67   char type;
68   struct item* left;
69   struct item* right;
70 };
71 
fail(const char * err,const char * par)72 int fail(const char* err, const char* par) {
73   fprintf(stderr, err, par);
74   return 1;
75 }
76 
code2table(struct item * tree,char ** table,char * code,int deep)77 void code2table(struct item* tree, char** table, char* code, int deep) {
78   int first = 0;
79   if (!code) {
80     first = 1;
81     code = (char*)malloc(CODELEN);
82   }
83   code[deep] = '1';
84   if (tree->left)
85     code2table(tree->left, table, code, deep + 1);
86   if (tree->type != code_NODE) {
87     int i = tree->word;
88     code[deep] = '\0';
89     if (tree->type == code_TERM)
90       i = CODELEN; /* terminal code */
91     table[i] = (char*)malloc(deep + 1);
92     strcpy(table[i], code);
93   }
94   code[deep] = '0';
95   if (tree->right)
96     code2table(tree->right, table, code, deep + 1);
97   if (first)
98     free(code);
99 }
100 
newitem(int c,struct item * l,struct item * r,int t)101 struct item* newitem(int c, struct item* l, struct item* r, int t) {
102   struct item* ni = (struct item*)malloc(sizeof(struct item));
103   ni->type = t;
104   ni->word = 0;
105   ni->count = c;
106   ni->left = l;
107   ni->right = r;
108   return ni;
109 }
110 
111 /* return length of the freq array */
get_freqdata(struct item *** dest,FILE * f,unsigned short * termword)112 int get_freqdata(struct item*** dest, FILE* f, unsigned short* termword) {
113   int freq[CODELEN];
114   int i, j, k, n;
115   union {
116     char c[2];
117     unsigned short word;
118   } u;
119   for (i = 0; i < CODELEN; i++)
120     freq[i] = 0;
121   while ((j = getc(f)) != -1 && (k = getc(f)) != -1) {
122     u.c[0] = j;
123     u.c[1] = k;
124     freq[u.word]++;
125   }
126   if (j != -1) {
127     u.c[0] = 1;
128     u.c[1] = j;
129   } else {
130     u.c[0] = 0;
131     u.c[1] = 0;
132   }
133 
134   *dest = (struct item**)malloc((CODELEN + 1) * sizeof(struct item*));
135   if (!*dest)
136     return -1;
137   for (i = 0, n = 0; i < CODELEN; i++)
138     if (freq[i]) {
139       (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
140       (*dest)[n]->word = i;
141       n++;
142     }
143   /* terminal sequence (also contains the last odd byte of the file) */
144   (*dest)[n] = newitem(1, NULL, NULL, code_TERM);
145   *termword = u.word;
146   return n + 1;
147 }
148 
get_codetable(struct item ** l,int n,char ** table)149 void get_codetable(struct item** l, int n, char** table) {
150   int i;
151   while (n > 1) {
152     int min = 0;
153     int mi2 = 1;
154     for (i = 1; i < n; i++) {
155       if (l[i]->count < l[min]->count) {
156         mi2 = min;
157         min = i;
158       } else if (l[i]->count < l[mi2]->count)
159         mi2 = i;
160     }
161     l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
162     for (i = mi2 + 1; i < n; i++)
163       l[i - 1] = l[i];
164     n--;
165   }
166   code2table(l[0], table, NULL, 0);
167 }
168 
write_bits(FILE * f,char * bitbuf,int * bits,char * code)169 int write_bits(FILE* f, char* bitbuf, int* bits, char* code) {
170   while (*code) {
171     int b = (*bits) % 8;
172     if (!b)
173       bitbuf[(*bits) / 8] = ((*code) - '0') << 7;
174     else
175       bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b));
176     (*bits)++;
177     code++;
178     if (*bits == BUFSIZE * 8) {
179       if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f))
180         return 1;
181       *bits = 0;
182     }
183   }
184   return 0;
185 }
186 
encode_file(char ** table,int n,FILE * f,FILE * f2,unsigned short tw,char * key)187 int encode_file(char** table,
188                 int n,
189                 FILE* f,
190                 FILE* f2,
191                 unsigned short tw,
192                 char* key) {
193   char bitbuf[BUFSIZE];
194   int i, bits = 0;
195   unsigned char cl, ch;
196   int cx[2];
197   union {
198     char c[2];
199     unsigned short word;
200   } u;
201   char* enc = key;
202 
203   /* header and codes */
204   fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
205   cl = (unsigned char)(n & 0x00ff);
206   ch = (unsigned char)(n >> 8);
207   if (key) {
208     unsigned char cs;
209     for (cs = 0; *enc; enc++)
210       cs ^= *enc;
211     fprintf(f2, "%c", cs); /* 1-byte check sum */
212     enc = key;
213     ch ^= *enc;
214     if ((*(++enc)) == '\0')
215       enc = key;
216     cl ^= *enc;
217   }
218   fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */
219   for (i = 0; i < BUFSIZE; i++)
220     bitbuf[i] = '\0';
221   for (i = 0; i < CODELEN + 1; i++)
222     if (table[i]) {
223       size_t nmemb;
224       u.word = (unsigned short)i;
225       if (i == CODELEN)
226         u.word = tw;
227       if (key) {
228         if (*(++enc) == '\0')
229           enc = key;
230         u.c[0] ^= *enc;
231         if (*(++enc) == '\0')
232           enc = key;
233         u.c[1] ^= *enc;
234       }
235       fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
236       bits = 0;
237       if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
238         return 1;
239       if (key) {
240         if (*(++enc) == '\0')
241           enc = key;
242         fprintf(f2, "%c", ((unsigned char)bits) ^ *enc);
243         for (cl = 0; cl <= bits / 8; cl++) {
244           if (*(++enc) == '\0')
245             enc = key;
246           bitbuf[cl] ^= *enc;
247         }
248       } else
249         fprintf(f2, "%c", (unsigned char)bits); /* 1-byte code length */
250       nmemb = bits / 8 + 1;
251       if (fwrite(bitbuf, 1, bits / 8 + 1, f2) != nmemb) /* x-byte code */
252         return 1;
253     }
254 
255   /* file encoding */
256   bits = 0;
257   while ((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
258     u.c[0] = cx[0];
259     u.c[1] = cx[1];
260     if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
261       return 1;
262   }
263   /* terminal suffixes */
264   if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
265     return 1;
266   if (bits > 0) {
267     size_t nmemb = bits / 8 + 1;
268     if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
269       return 1;
270   }
271   return 0;
272 }
273 
prefixcompress(FILE * f,FILE * tempfile)274 int prefixcompress(FILE* f, FILE* tempfile) {
275   char buf[BUFSIZE];
276   char buf2[BUFSIZE * 2];
277   char prev[BUFSIZE];
278   int prevlen = 0;
279   while (fgets(buf, BUFSIZE, f)) {
280     int i, j, k, m, c = 0;
281     int pfx = prevlen;
282     char* p = buf2;
283     m = j = 0;
284     for (i = 0; buf[i]; i++) {
285       if ((pfx > 0) && (buf[i] == prev[i])) {
286         j++;
287       } else
288         pfx = 0;
289     }
290     if (i > 0 && buf[i - 1] == '\n') {
291       if (j == i)
292         j--; /* line duplicate */
293       if (j > 29)
294         j = 29;
295       c = j;
296       if (c == '\t')
297         c = 30;
298       /* common suffix */
299       for (; (m < i - j - 1) && (m < 15) && (prevlen - m - 2 >= 0) &&
300              buf[i - m - 2] == prev[prevlen - m - 2];
301            m++)
302         ;
303       if (m == 1)
304         m = 0;
305     } else {
306       j = 0;
307       m = -1;
308     }
309     for (k = j; k < i - m - 1; k++, p++) {
310       if (((unsigned char)buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
311         *p = ESCAPE;
312         p++;
313       }
314       *p = buf[k];
315     }
316     if (m > 0) {
317       *p = m + 31; /* 33-46 */
318       p++;
319     }
320     if (i > 0 && buf[i - 1] == '\n') {
321       size_t nmemb = p - buf2 + 1;
322       *p = c;
323       if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
324         return 1;
325     } else {
326       size_t nmemb = p - buf2;
327       if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
328         return 1;
329     }
330     memcpy(prev, buf, i);
331     prevlen = i;
332   }
333   return 0;
334 }
335 
hzip(const char * filename,char * key)336 int hzip(const char* filename, char* key) {
337   struct item** list;
338   char* table[CODELEN + 1];
339   int n;
340   unsigned short termword;
341 
342   FILE* f = fopen(filename, "r");
343   if (!f)
344     return fail("hzip: %s: Permission denied\n", filename);
345 
346   char tmpfiletemplate[] = "/tmp/hunspellXXXXXX";
347   mode_t mask = umask(S_IXUSR | S_IRWXG | S_IRWXO);
348   int tempfileno = mkstemp(tmpfiletemplate);
349   umask(mask);
350   if (tempfileno == -1) {
351     fclose(f);
352     return fail("hzip: cannot create temporary file\n", NULL);
353   }
354 
355   FILE *tempfile = fdopen(tempfileno, "rw");
356   if (!tempfile) {
357     close(tempfileno);
358     unlink(tmpfiletemplate);
359     fclose(f);
360     return fail("hzip: cannot create temporary file\n", NULL);
361   }
362 
363   std::string out(filename);
364   out.append(EXTENSION);
365   FILE* f2 = fopen(out.c_str(), "wb");
366   if (!f2) {
367     fclose(tempfile);
368     fclose(f);
369     unlink(tmpfiletemplate);
370     return fail("hzip: %s: Permission denied\n", out.c_str());
371   }
372   for (n = 0; n < CODELEN; n++)
373     table[n] = NULL;
374   if (prefixcompress(f, tempfile) != 0) {
375     fclose(f2);
376     fclose(tempfile);
377     fclose(f);
378     unlink(tmpfiletemplate);
379     return fail("hzip: cannot write file\n", NULL);
380   }
381   rewind(tempfile);
382   n = get_freqdata(&list, tempfile, &termword);
383   get_codetable(list, n, table);
384   rewind(tempfile);
385   n = encode_file(table, n, tempfile, f2, termword, key);
386   free(list);
387   fclose(f2);
388   fclose(tempfile);
389   fclose(f);
390   unlink(tmpfiletemplate);
391   if (n != 0)
392     return fail("hzip: cannot write file\n", NULL);
393   return n;
394 }
395 
main(int argc,char ** argv)396 int main(int argc, char** argv) {
397   int i, j = 0;
398   char* key = NULL;
399   for (i = 1; i < argc; i++) {
400     if (*(argv[i]) == '-') {
401       if (*(argv[i] + 1) == 'h')
402         return fail(DESC, NULL);
403       if (*(argv[i] + 1) == 'P') {
404         if (i + 1 == argc)
405           return fail("hzip: missing password\n", NULL);
406         key = argv[i + 1];
407         i++;
408         continue;
409       }
410       return fail("hzip: no such option: %s\n", argv[i]);
411     } else if (hzip(argv[i], key) != 0)
412       return 1;
413     else
414       j = 1;
415   }
416   if (j == 0)
417     return fail("hzip: need a filename parameter\n", NULL);
418   return 0;
419 }
420