1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38 /* hzip: file compression for sorted dictionaries with optional encryption,
39 * algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
40
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <unistd.h>
45 #include <string>
46 #include <sys/stat.h>
47
48 #define CODELEN 65536
49 #define BUFSIZE 65536
50 #define EXTENSION ".hz"
51
52 #define ESCAPE 31
53 #define MAGIC "hz0"
54 #define MAGIC_ENCRYPTED "hz1"
55
56 #define DESC \
57 "hzip - dictionary compression utility\n" \
58 "Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
59 " -P password encrypted compression\n" \
60 " -h display this help and exit\n"
61
62 enum { code_LEAF, code_TERM, code_NODE };
63
64 struct item {
65 unsigned short word;
66 int count;
67 char type;
68 struct item* left;
69 struct item* right;
70 };
71
fail(const char * err,const char * par)72 int fail(const char* err, const char* par) {
73 fprintf(stderr, err, par);
74 return 1;
75 }
76
code2table(struct item * tree,char ** table,char * code,int deep)77 void code2table(struct item* tree, char** table, char* code, int deep) {
78 int first = 0;
79 if (!code) {
80 first = 1;
81 code = (char*)malloc(CODELEN);
82 }
83 code[deep] = '1';
84 if (tree->left)
85 code2table(tree->left, table, code, deep + 1);
86 if (tree->type != code_NODE) {
87 int i = tree->word;
88 code[deep] = '\0';
89 if (tree->type == code_TERM)
90 i = CODELEN; /* terminal code */
91 table[i] = (char*)malloc(deep + 1);
92 strcpy(table[i], code);
93 }
94 code[deep] = '0';
95 if (tree->right)
96 code2table(tree->right, table, code, deep + 1);
97 if (first)
98 free(code);
99 }
100
newitem(int c,struct item * l,struct item * r,int t)101 struct item* newitem(int c, struct item* l, struct item* r, int t) {
102 struct item* ni = (struct item*)malloc(sizeof(struct item));
103 ni->type = t;
104 ni->word = 0;
105 ni->count = c;
106 ni->left = l;
107 ni->right = r;
108 return ni;
109 }
110
111 /* return length of the freq array */
get_freqdata(struct item *** dest,FILE * f,unsigned short * termword)112 int get_freqdata(struct item*** dest, FILE* f, unsigned short* termword) {
113 int freq[CODELEN];
114 int i, j, k, n;
115 union {
116 char c[2];
117 unsigned short word;
118 } u;
119 for (i = 0; i < CODELEN; i++)
120 freq[i] = 0;
121 while ((j = getc(f)) != -1 && (k = getc(f)) != -1) {
122 u.c[0] = j;
123 u.c[1] = k;
124 freq[u.word]++;
125 }
126 if (j != -1) {
127 u.c[0] = 1;
128 u.c[1] = j;
129 } else {
130 u.c[0] = 0;
131 u.c[1] = 0;
132 }
133
134 *dest = (struct item**)malloc((CODELEN + 1) * sizeof(struct item*));
135 if (!*dest)
136 return -1;
137 for (i = 0, n = 0; i < CODELEN; i++)
138 if (freq[i]) {
139 (*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
140 (*dest)[n]->word = i;
141 n++;
142 }
143 /* terminal sequence (also contains the last odd byte of the file) */
144 (*dest)[n] = newitem(1, NULL, NULL, code_TERM);
145 *termword = u.word;
146 return n + 1;
147 }
148
get_codetable(struct item ** l,int n,char ** table)149 void get_codetable(struct item** l, int n, char** table) {
150 int i;
151 while (n > 1) {
152 int min = 0;
153 int mi2 = 1;
154 for (i = 1; i < n; i++) {
155 if (l[i]->count < l[min]->count) {
156 mi2 = min;
157 min = i;
158 } else if (l[i]->count < l[mi2]->count)
159 mi2 = i;
160 }
161 l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
162 for (i = mi2 + 1; i < n; i++)
163 l[i - 1] = l[i];
164 n--;
165 }
166 code2table(l[0], table, NULL, 0);
167 }
168
write_bits(FILE * f,char * bitbuf,int * bits,char * code)169 int write_bits(FILE* f, char* bitbuf, int* bits, char* code) {
170 while (*code) {
171 int b = (*bits) % 8;
172 if (!b)
173 bitbuf[(*bits) / 8] = ((*code) - '0') << 7;
174 else
175 bitbuf[(*bits) / 8] |= (((*code) - '0') << (7 - b));
176 (*bits)++;
177 code++;
178 if (*bits == BUFSIZE * 8) {
179 if (BUFSIZE != fwrite(bitbuf, 1, BUFSIZE, f))
180 return 1;
181 *bits = 0;
182 }
183 }
184 return 0;
185 }
186
encode_file(char ** table,int n,FILE * f,FILE * f2,unsigned short tw,char * key)187 int encode_file(char** table,
188 int n,
189 FILE* f,
190 FILE* f2,
191 unsigned short tw,
192 char* key) {
193 char bitbuf[BUFSIZE];
194 int i, bits = 0;
195 unsigned char cl, ch;
196 int cx[2];
197 union {
198 char c[2];
199 unsigned short word;
200 } u;
201 char* enc = key;
202
203 /* header and codes */
204 fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
205 cl = (unsigned char)(n & 0x00ff);
206 ch = (unsigned char)(n >> 8);
207 if (key) {
208 unsigned char cs;
209 for (cs = 0; *enc; enc++)
210 cs ^= *enc;
211 fprintf(f2, "%c", cs); /* 1-byte check sum */
212 enc = key;
213 ch ^= *enc;
214 if ((*(++enc)) == '\0')
215 enc = key;
216 cl ^= *enc;
217 }
218 fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */
219 for (i = 0; i < BUFSIZE; i++)
220 bitbuf[i] = '\0';
221 for (i = 0; i < CODELEN + 1; i++)
222 if (table[i]) {
223 size_t nmemb;
224 u.word = (unsigned short)i;
225 if (i == CODELEN)
226 u.word = tw;
227 if (key) {
228 if (*(++enc) == '\0')
229 enc = key;
230 u.c[0] ^= *enc;
231 if (*(++enc) == '\0')
232 enc = key;
233 u.c[1] ^= *enc;
234 }
235 fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
236 bits = 0;
237 if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
238 return 1;
239 if (key) {
240 if (*(++enc) == '\0')
241 enc = key;
242 fprintf(f2, "%c", ((unsigned char)bits) ^ *enc);
243 for (cl = 0; cl <= bits / 8; cl++) {
244 if (*(++enc) == '\0')
245 enc = key;
246 bitbuf[cl] ^= *enc;
247 }
248 } else
249 fprintf(f2, "%c", (unsigned char)bits); /* 1-byte code length */
250 nmemb = bits / 8 + 1;
251 if (fwrite(bitbuf, 1, bits / 8 + 1, f2) != nmemb) /* x-byte code */
252 return 1;
253 }
254
255 /* file encoding */
256 bits = 0;
257 while ((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
258 u.c[0] = cx[0];
259 u.c[1] = cx[1];
260 if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
261 return 1;
262 }
263 /* terminal suffixes */
264 if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
265 return 1;
266 if (bits > 0) {
267 size_t nmemb = bits / 8 + 1;
268 if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
269 return 1;
270 }
271 return 0;
272 }
273
prefixcompress(FILE * f,FILE * tempfile)274 int prefixcompress(FILE* f, FILE* tempfile) {
275 char buf[BUFSIZE];
276 char buf2[BUFSIZE * 2];
277 char prev[BUFSIZE];
278 int prevlen = 0;
279 while (fgets(buf, BUFSIZE, f)) {
280 int i, j, k, m, c = 0;
281 int pfx = prevlen;
282 char* p = buf2;
283 m = j = 0;
284 for (i = 0; buf[i]; i++) {
285 if ((pfx > 0) && (buf[i] == prev[i])) {
286 j++;
287 } else
288 pfx = 0;
289 }
290 if (i > 0 && buf[i - 1] == '\n') {
291 if (j == i)
292 j--; /* line duplicate */
293 if (j > 29)
294 j = 29;
295 c = j;
296 if (c == '\t')
297 c = 30;
298 /* common suffix */
299 for (; (m < i - j - 1) && (m < 15) && (prevlen - m - 2 >= 0) &&
300 buf[i - m - 2] == prev[prevlen - m - 2];
301 m++)
302 ;
303 if (m == 1)
304 m = 0;
305 } else {
306 j = 0;
307 m = -1;
308 }
309 for (k = j; k < i - m - 1; k++, p++) {
310 if (((unsigned char)buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
311 *p = ESCAPE;
312 p++;
313 }
314 *p = buf[k];
315 }
316 if (m > 0) {
317 *p = m + 31; /* 33-46 */
318 p++;
319 }
320 if (i > 0 && buf[i - 1] == '\n') {
321 size_t nmemb = p - buf2 + 1;
322 *p = c;
323 if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
324 return 1;
325 } else {
326 size_t nmemb = p - buf2;
327 if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
328 return 1;
329 }
330 memcpy(prev, buf, i);
331 prevlen = i;
332 }
333 return 0;
334 }
335
hzip(const char * filename,char * key)336 int hzip(const char* filename, char* key) {
337 struct item** list;
338 char* table[CODELEN + 1];
339 int n;
340 unsigned short termword;
341
342 FILE* f = fopen(filename, "r");
343 if (!f)
344 return fail("hzip: %s: Permission denied\n", filename);
345
346 char tmpfiletemplate[] = "/tmp/hunspellXXXXXX";
347 mode_t mask = umask(S_IXUSR | S_IRWXG | S_IRWXO);
348 int tempfileno = mkstemp(tmpfiletemplate);
349 umask(mask);
350 if (tempfileno == -1) {
351 fclose(f);
352 return fail("hzip: cannot create temporary file\n", NULL);
353 }
354
355 FILE *tempfile = fdopen(tempfileno, "rw");
356 if (!tempfile) {
357 close(tempfileno);
358 unlink(tmpfiletemplate);
359 fclose(f);
360 return fail("hzip: cannot create temporary file\n", NULL);
361 }
362
363 std::string out(filename);
364 out.append(EXTENSION);
365 FILE* f2 = fopen(out.c_str(), "wb");
366 if (!f2) {
367 fclose(tempfile);
368 fclose(f);
369 unlink(tmpfiletemplate);
370 return fail("hzip: %s: Permission denied\n", out.c_str());
371 }
372 for (n = 0; n < CODELEN; n++)
373 table[n] = NULL;
374 if (prefixcompress(f, tempfile) != 0) {
375 fclose(f2);
376 fclose(tempfile);
377 fclose(f);
378 unlink(tmpfiletemplate);
379 return fail("hzip: cannot write file\n", NULL);
380 }
381 rewind(tempfile);
382 n = get_freqdata(&list, tempfile, &termword);
383 get_codetable(list, n, table);
384 rewind(tempfile);
385 n = encode_file(table, n, tempfile, f2, termword, key);
386 free(list);
387 fclose(f2);
388 fclose(tempfile);
389 fclose(f);
390 unlink(tmpfiletemplate);
391 if (n != 0)
392 return fail("hzip: cannot write file\n", NULL);
393 return n;
394 }
395
main(int argc,char ** argv)396 int main(int argc, char** argv) {
397 int i, j = 0;
398 char* key = NULL;
399 for (i = 1; i < argc; i++) {
400 if (*(argv[i]) == '-') {
401 if (*(argv[i] + 1) == 'h')
402 return fail(DESC, NULL);
403 if (*(argv[i] + 1) == 'P') {
404 if (i + 1 == argc)
405 return fail("hzip: missing password\n", NULL);
406 key = argv[i + 1];
407 i++;
408 continue;
409 }
410 return fail("hzip: no such option: %s\n", argv[i]);
411 } else if (hzip(argv[i], key) != 0)
412 return 1;
413 else
414 j = 1;
415 }
416 if (j == 0)
417 return fail("hzip: need a filename parameter\n", NULL);
418 return 0;
419 }
420