1 /* #define DEBUG */
2 #include <stdio.h>
3 #include "sqcom.h"
4 #include "sq.h"
5
6 #ifdef TOPS20
7 #include <libt20.h>
8 #endif
9
10 #define ERROR -1
11 #define TRUE 1
12 #define FALSE 0
13
14 /******** Second translation - bytes to variable length bit strings *********/
15
16
17 /* This translation uses the Huffman algorithm to develop a
18 * binary tree representing the decoding information for
19 * a variable length bit string code for each input value.
20 * Each string's length is in inverse proportion to its
21 * frequency of appearance in the incoming data stream.
22 * The encoding table is derived from the decoding table.
23 *
24 * The range of valid values into the Huffman algorithm are
25 * the values of a byte stored in an integer plus the special
26 * endfile value chosen to be an adjacent value. Overall, 0-SPEOF.
27 *
28 * The "node" array of structures contains the nodes of the
29 * binary tree. The first NUMVALS nodes are the leaves of the
30 * tree and represent the values of the data bytes being
31 * encoded and the special endfile, SPEOF.
32 * The remaining nodes become the internal nodes of the tree.
33 *
34 * In the original design it was believed that
35 * a Huffman code would fit in the same number of
36 * bits that will hold the sum of all the counts.
37 * That was disproven by a user's file and was a rare but
38 * infamous bug. This version attempts to choose among equally
39 * weighted subtrees according to their maximum depths to avoid
40 * unnecessarily long codes. In case that is not sufficient
41 * to guarantee codes <= 16 bits long, we initially scale
42 * the counts so the total fits in an unsigned integer, but
43 * if codes longer than 16 bits are generated the counts are
44 * rescaled to a lower ceiling and code generation is retried.
45 */
46
47 /* Initialize the Huffman translation. This requires reading
48 * the input file through any preceding translation functions
49 * to get the frequency distribution of the various values.
50 */
51
init_huff(ib)52 init_huff(ib)
53 FILE *ib;
54 {
55 int c, i;
56 int btlist[NUMVALS]; /* list of intermediate binary trees */
57 int listlen; /* length of btlist */
58 unsigned int *wp; /* simplifies weight counting */
59 unsigned int ceiling; /* limit for scaling */
60
61 /* Initialize tree nodes to no weight, no children */
62 init_tree();
63
64 /* Build frequency info in tree */
65 do {
66 c = getcnr(ib);
67 if(c == EOF)
68 c = SPEOF;
69 if(*(wp = &node[c].weight) != MAXCOUNT)
70 ++(*wp);
71 } while(c != SPEOF);
72 #ifdef DEBUG
73 pcounts(); /* debugging aid */
74 #endif
75 ceiling = MAXCOUNT;
76
77 do { /* Keep trying to scale and encode */
78 if(ceiling != MAXCOUNT) {
79 printf("*** rescaling ***, ");
80 #ifdef TOPS20
81 fflush (stdout);
82 #endif
83 }
84 scale(ceiling);
85 ceiling /= 2; /* in case we rescale */
86 #ifdef DEBUG
87 pcounts(); /* debugging aid */
88 #endif
89
90 /* Build list of single node binary trees having
91 * leaves for the input values with non-zero counts
92 */
93 for(i = listlen = 0; i < NUMVALS; ++i)
94 if(node[i].weight != 0) {
95 node[i].tdepth = 0;
96 btlist[listlen++] = i;
97 }
98
99 /* Arrange list of trees into a heap with the entry
100 * indexing the node with the least weight at the top.
101 */
102 heap(btlist, listlen);
103
104 /* Convert the list of trees to a single decoding tree */
105 bld_tree(btlist, listlen);
106
107 /* Initialize the encoding table */
108 init_enc();
109
110 /* Try to build encoding table.
111 * Fail if any code is > 16 bits long.
112 */
113 } while(buildenc(0, dctreehd) == ERROR);
114 #ifdef DEBUG
115 phuff(); /* debugging aid */
116 #endif
117 /* Initialize encoding variables */
118 cbitsrem = 0; /*force initial read */
119 curin = 0; /*anything but endfile*/
120 }
121 /* */
122 /* The count of number of occurrances of each input value
123 * have already been prevented from exceeding MAXCOUNT.
124 * Now we must scale them so that their sum doesn't exceed
125 * ceiling and yet no non-zero count can become zero.
126 * This scaling prevents errors in the weights of the
127 * interior nodes of the Huffman tree and also ensures that
128 * the codes will fit in an unsigned integer. Rescaling is
129 * used if necessary to limit the code length.
130 */
131
scale(ceil)132 scale(ceil)
133 unsigned int ceil; /* upper limit on total weight */
134 {
135 int c, ovflw, divisor, i;
136 unsigned int w, sum;
137 char increased; /* flag */
138
139 do {
140 for(i = sum = ovflw = 0; i < NUMVALS; ++i) {
141 if(node[i].weight > (ceil - sum))
142 ++ovflw;
143 sum += node[i].weight;
144 }
145
146 divisor = ovflw + 1;
147
148 /* Ensure no non-zero values are lost */
149 increased = FALSE;
150 for(i = 0; i < NUMVALS; ++i) {
151 w = node[i].weight;
152 if (w < divisor && w > 0) {
153 /* Don't fail to provide a code if it's used at all */
154 node[i].weight = divisor;
155 increased = TRUE;
156 }
157 }
158 } while(increased);
159
160 /* Scaling factor choosen, now scale */
161 if(divisor > 1)
162 for(i = 0; i < NUMVALS; ++i)
163 node[i].weight /= divisor;
164 }
165 /* */
166 /* heap() and adjust() maintain a list of binary trees as a
167 * heap with the top indexing the binary tree on the list
168 * which has the least weight or, in case of equal weights,
169 * least depth in its longest path. The depth part is not
170 * strictly necessary, but tends to avoid long codes which
171 * might provoke rescaling.
172 */
173
heap(list,length)174 heap(list, length)
175 int list[], length;
176 {
177 int i;
178
179 for(i = (length - 2) / 2; i >= 0; --i)
180 adjust(list, i, length - 1);
181 }
182
183 /* Make a heap from a heap with a new top */
184
adjust(list,top,bottom)185 adjust(list, top, bottom)
186 int list[], top, bottom;
187 {
188 int k, temp;
189 char cmptrees();
190
191 k = 2 * top + 1; /* left child of top */
192 temp = list[top]; /* remember root node of top tree */
193 if( k <= bottom) {
194 if( k < bottom && cmptrees(list[k], list[k + 1]))
195 ++k;
196
197 /* k indexes "smaller" child (in heap of trees) of top */
198 /* now make top index "smaller" of old top and smallest child */
199 if(cmptrees(temp, list[k])) {
200 list[top] = list[k];
201 list[k] = temp;
202 /* Make the changed list a heap */
203 adjust(list, k, bottom); /*recursive*/
204 }
205 }
206 }
207
208 /* Compare two trees, if a > b return true, else return false
209 * note comparison rules in previous comments.
210 */
211
212 char /* Boolean */
cmptrees(a,b)213 cmptrees(a, b)
214 int a, b; /* root nodes of trees */
215 {
216 if(node[a].weight > node[b].weight)
217 return (TRUE);
218 if(node[a].weight == node[b].weight)
219 if(node[a].tdepth > node[b].tdepth)
220 return (TRUE);
221 return (FALSE);
222 }
223 /* */
224 /* HUFFMAN ALGORITHM: develops the single element trees
225 * into a single binary tree by forming subtrees rooted in
226 * interior nodes having weights equal to the sum of weights of all
227 * their descendents and having depth counts indicating the
228 * depth of their longest paths.
229 *
230 * When all trees have been formed into a single tree satisfying
231 * the heap property (on weight, with depth as a tie breaker)
232 * then the binary code assigned to a leaf (value to be encoded)
233 * is then the series of left (0) and right (1)
234 * paths leading from the root to the leaf.
235 * Note that trees are removed from the heaped list by
236 * moving the last element over the top element and
237 * reheaping the shorter list.
238 */
239
bld_tree(list,len)240 bld_tree(list, len)
241 int list[];
242 int len;
243 {
244 int freenode; /* next free node in tree */
245 int lch, rch; /* temporaries for left, right children */
246 struct nd *frnp; /* free node pointer */
247 int i;
248 char maxchar();
249
250 /* Initialize index to next available (non-leaf) node.
251 * Lower numbered nodes correspond to leaves (data values).
252 */
253 freenode = NUMVALS;
254
255 while(len > 1) {
256 /* Take from list two btrees with least weight
257 * and build an interior node pointing to them.
258 * This forms a new tree.
259 */
260 lch = list[0]; /* This one will be left child */
261
262 /* delete top (least) tree from the list of trees */
263 list[0] = list[--len];
264 adjust(list, 0, len - 1);
265
266 /* Take new top (least) tree. Reuse list slot later */
267 rch = list[0]; /* This one will be right child */
268
269 /* Form new tree from the two least trees using
270 * a free node as root. Put the new tree in the list.
271 */
272 frnp = &node[freenode]; /* address of next free node */
273 list[0] = freenode++; /* put at top for now */
274 frnp->lchild = lch;
275 frnp->rchild = rch;
276 frnp->weight = node[lch].weight + node[rch].weight;
277 frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);
278 /* reheap list to get least tree at top*/
279 adjust(list, 0, len - 1);
280 }
281 dctreehd = list[0]; /*head of final tree */
282 }
283 /* */
284 char
maxchar(a,b)285 maxchar(a, b)
286 char a, b;
287 {
288 return (a > b ? a : b);
289 }
290 /* Initialize all nodes to single element binary trees
291 * with zero weight and depth.
292 */
293
init_tree()294 init_tree()
295 {
296 int i;
297
298 for(i = 0; i < NUMNODES; ++i) {
299 node[i].weight = 0;
300 node[i].tdepth = 0;
301 node[i].lchild = NOCHILD;
302 node[i].rchild = NOCHILD;
303 }
304 }
305
init_enc()306 init_enc()
307 {
308 int i;
309
310 /* Initialize encoding table */
311 for(i = 0; i < NUMVALS; ++i) {
312 codelen[i] = 0;
313 }
314 }
315 /* */
316 /* Recursive routine to walk the indicated subtree and level
317 * and maintain the current path code in bstree. When a leaf
318 * is found the entire code string and length are put into
319 * the encoding table entry for the leaf's data value.
320 *
321 * Returns ERROR if codes are too long.
322 */
323
324 int /* returns ERROR or NULL */
buildenc(level,root)325 buildenc(level, root)
326 int level;/* level of tree being examined, from zero */
327 int root; /* root of subtree is also data value if leaf */
328 {
329 int l, r;
330
331 l = node[root].lchild;
332 r = node[root].rchild;
333 #ifdef DEBUG
334 if (debug) printf("level=%d,root=%d,l=%d,r=%d,tcode=%04x\n",level,root,l,r,tcode);
335 #endif
336 if( l == NOCHILD && r == NOCHILD) {
337 /* Leaf. Previous path determines bit string
338 * code of length level (bits 0 to level - 1).
339 * Ensures unused code bits are zero.
340 */
341 codelen[root] = level;
342 #ifdef TOPS20
343 code[root] = tcode & (65535 >> (16 - level));
344 #else
345 code[root] = tcode & ((unsigned int)(~0) >> ((sizeof(int) * 8) - level));
346 #endif
347 #ifdef DEBUG
348 if (debug) printf(" codelen[%d]=%d,code[%d]=%02x\n",root,codelen[root],root,code[root]);
349 #endif
350 return ((level > 16) ? ERROR : NULL);
351 } else {
352 if( l != NOCHILD) {
353 /* Clear path bit and continue deeper */
354 tcode &= ~(1 << level);
355 /* NOTE RECURSION */
356 if(buildenc(level + 1, l) == ERROR)
357 return (ERROR);
358 }
359 if(r != NOCHILD) {
360 /* Set path bit and continue deeper */
361 tcode |= 1 << level;
362 /* NOTE RECURSION */
363 if(buildenc(level + 1, r) == ERROR)
364 return (ERROR);
365 }
366 }
367 return (NULL); /* if we got here we're ok so far */
368 }
369 /* */
370 /* Write out the header of the compressed file */
371
372 #ifdef TOPS20
wrt_head(ob,req)373 wrt_head(ob, req)
374 FILE *ob;
375 struct filename *req;
376 #else
377 wrt_head(ob, infile)
378 FILE *ob;
379 char *infile; /* input file name (w/ or w/o drive) */
380 #endif
381 {
382 #ifdef TOPS20
383 char *q;
384 char fnbuf[100], lfname[40], lftype[40];
385 #endif
386 int i, k, l, r;
387 int numnodes; /* nbr of nodes in simplified tree */
388
389 putwe(RECOGNIZE, ob); /* identifies as compressed */
390 putwe(crc, ob); /* unsigned sum of original data */
391 #ifdef TOPS20
392 strcpy (lfname, req->nm);
393 strcpy (lftype, req->typ);
394 lower (lfname);
395 lower (lftype);
396 sprintf (fnbuf, "%.8s%s%.3s", lfname,
397 (req->typ && *req->typ) ? "." : "", lftype);
398 q = fnbuf;
399 do {
400 putce(*q, ob);
401 } while(*(q++) != '\0');
402 #else
403 /* Record the original file name w/o drive */
404 if(*(infile + 1) == ':')
405 infile += 2; /* skip drive */
406
407 do {
408 putce(*infile, ob);
409 } while(*(infile++) != '\0');
410 #endif
411
412 /* Write out a simplified decoding tree. Only the interior
413 * nodes are written. When a child is a leaf index
414 * (representing a data value) it is recoded as
415 * -(index + 1) to distinguish it from interior indexes
416 * which are recoded as positive indexes in the new tree.
417 * Note that this tree will be empty for an empty file.
418 */
419
420 numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS -1);
421 putwe(numnodes, ob);
422
423 for(k = 0, i = dctreehd; k < numnodes; ++k, --i) {
424 l = node[i].lchild;
425 r = node[i].rchild;
426 l = l < NUMVALS ? -(l + 1) : dctreehd - l;
427 r = r < NUMVALS ? -(r + 1) : dctreehd - r;
428 putwe(l, ob); /* left child */
429 putwe(r, ob); /* right child */
430 }
431 }
432 /* */
433 /* Get an encoded byte or EOF. Reads from specified stream AS NEEDED.
434 *
435 * There are two unsynchronized bit-byte relationships here.
436 * The input stream bytes are converted to bit strings of
437 * various lengths via the static variables named c...
438 * These bit strings are concatenated without padding to
439 * become the stream of encoded result bytes, which this
440 * function returns one at a time. The EOF (end of file) is
441 * converted to SPEOF for convenience and encoded like any
442 * other input value. True EOF is returned after that.
443 *
444 * The original gethuff() called a seperate function,
445 * getbit(), but that more readable version was too slow.
446 */
447
448 int /* Returns byte values except for EOF */
gethuff(ib)449 gethuff(ib)
450 FILE *ib;
451 {
452 unsigned int rbyte; /* Result byte value */
453 char need, take; /* numbers of bits */
454
455 rbyte = 0;
456 need = 8; /* build one byte per call */
457
458 /* Loop to build a byte of encoded data
459 * Initialization forces read the first time
460 */
461
462 loop:
463 if(cbitsrem >= need) {
464 /* Current code fullfills our needs */
465 if(need == 0)
466 return (rbyte & 0xff);
467 /* Take what we need */
468 rbyte |= ccode << (8 - need);
469 /* And leave the rest */
470 ccode >>= need;
471 cbitsrem -= need;
472 return (rbyte & 0xff);
473 }
474
475 /* We need more than current code */
476 if(cbitsrem > 0) {
477 /* Take what there is */
478 rbyte |= ccode << (8 - need);
479 need -= cbitsrem;
480 }
481 /* No more bits in current code string */
482 if(curin == SPEOF) {
483 /* The end of file token has been encoded. If
484 * result byte has data return it and do EOF next time
485 */
486 cbitsrem = 0;
487
488 return ((need == 8) ? EOF : rbyte & 0xff);
489 }
490
491 /* Get an input byte */
492 if((curin = getcnr(ib)) == EOF)
493 curin = SPEOF; /* convenient for encoding */
494
495 /* Get the new byte's code */
496 ccode = code[curin];
497 cbitsrem = codelen[curin];
498
499 goto loop;
500 }
501