1 /* #define DEBUG */
2 #include <stdio.h>
3 #include "sqcom.h"
4 #include "sq.h"
5 
6 #ifdef TOPS20
7 #include <libt20.h>
8 #endif
9 
10 #define ERROR -1
11 #define TRUE 1
12 #define FALSE 0
13 
14 /******** Second translation - bytes to variable length bit strings *********/
15 
16 
17 /* This translation uses the Huffman algorithm to develop a
18  * binary tree representing the decoding information for
19  * a variable length bit string code for each input value.
20  * Each string's length is in inverse proportion to its
21  * frequency of appearance in the incoming data stream.
22  * The encoding table is derived from the decoding table.
23  *
24  * The range of valid values into the Huffman algorithm are
25  * the values of a byte stored in an integer plus the special
26  * endfile value chosen to be an adjacent value. Overall, 0-SPEOF.
27  *
28  * The "node" array of structures contains the nodes of the
29  * binary tree. The first NUMVALS nodes are the leaves of the
30  * tree and represent the values of the data bytes being
31  * encoded and the special endfile, SPEOF.
32  * The remaining nodes become the internal nodes of the tree.
33  *
34  * In the original design it was believed that
35  * a Huffman code would fit in the same number of
36  * bits that will hold the sum of all the counts.
37  * That was disproven by a user's file and was a rare but
38  * infamous bug. This version attempts to choose among equally
39  * weighted subtrees according to their maximum depths to avoid
40  * unnecessarily long codes. In case that is not sufficient
41  * to guarantee codes <= 16 bits long, we initially scale
42  * the counts so the total fits in an unsigned integer, but
43  * if codes longer than 16 bits are generated the counts are
44  * rescaled to a lower ceiling and code generation is retried.
45  */
46 
47 /* Initialize the Huffman translation. This requires reading
48  * the input file through any preceding translation functions
49  * to get the frequency distribution of the various values.
50  */
51 
init_huff(ib)52 init_huff(ib)
53 FILE *ib;
54 {
55 	int c, i;
56 	int btlist[NUMVALS];	/* list of intermediate binary trees */
57 	int listlen;		/* length of btlist */
58 	unsigned int *wp;	/* simplifies weight counting */
59 	unsigned int ceiling;	/* limit for scaling */
60 
61 	/* Initialize tree nodes to no weight, no children */
62 	init_tree();
63 
64 	/* Build frequency info in tree */
65 	do {
66 		c = getcnr(ib);
67 		if(c == EOF)
68 			c = SPEOF;
69 		if(*(wp = &node[c].weight) !=  MAXCOUNT)
70 			++(*wp);
71 	} while(c != SPEOF);
72 #ifdef DEBUG
73 	pcounts();	/* debugging aid */
74 #endif
75 	ceiling = MAXCOUNT;
76 
77 	do {	/* Keep trying to scale and encode */
78 		if(ceiling != MAXCOUNT) {
79 			printf("*** rescaling ***, ");
80 #ifdef TOPS20
81 			fflush (stdout);
82 #endif
83 		}
84 		scale(ceiling);
85 		ceiling /= 2;	/* in case we rescale */
86 #ifdef DEBUG
87 		pcounts();	/* debugging aid */
88 #endif
89 
90 		/* Build list of single node binary trees having
91 		 * leaves for the input values with non-zero counts
92 		 */
93 		for(i = listlen = 0; i < NUMVALS; ++i)
94 			if(node[i].weight != 0) {
95 				node[i].tdepth = 0;
96 				btlist[listlen++] = i;
97 			}
98 
99 		/* Arrange list of trees into a heap with the entry
100 		 * indexing the node with the least weight at the top.
101 		 */
102 		heap(btlist, listlen);
103 
104 		/* Convert the list of trees to a single decoding tree */
105 		bld_tree(btlist, listlen);
106 
107 		/* Initialize the encoding table */
108 		init_enc();
109 
110 		/* Try to build encoding table.
111 		 * Fail if any code is > 16 bits long.
112 		 */
113 	} while(buildenc(0, dctreehd) == ERROR);
114 #ifdef DEBUG
115 	phuff();	/* debugging aid */
116 #endif
117 	/* Initialize encoding variables */
118 	cbitsrem = 0;	/*force initial read */
119 	curin = 0;	/*anything but endfile*/
120 }
121 /*  */
122 /* The count of number of occurrances of each input value
123  * have already been prevented from exceeding MAXCOUNT.
124  * Now we must scale them so that their sum doesn't exceed
125  * ceiling and yet no non-zero count can become zero.
126  * This scaling prevents errors in the weights of the
127  * interior nodes of the Huffman tree and also ensures that
128  * the codes will fit in an unsigned integer. Rescaling is
129  * used if necessary to limit the code length.
130  */
131 
scale(ceil)132 scale(ceil)
133 unsigned int ceil;	/* upper limit on total weight */
134 {
135 	int c, ovflw, divisor, i;
136 	unsigned int w, sum;
137 	char increased;		/* flag */
138 
139 	do {
140 		for(i = sum = ovflw = 0; i < NUMVALS; ++i) {
141 			if(node[i].weight > (ceil - sum))
142 				++ovflw;
143 			sum += node[i].weight;
144 		}
145 
146 		divisor = ovflw + 1;
147 
148 		/* Ensure no non-zero values are lost */
149 		increased = FALSE;
150 		for(i = 0; i < NUMVALS; ++i) {
151 			w = node[i].weight;
152 			if (w < divisor && w > 0) {
153 				/* Don't fail to provide a code if it's used at all */
154 				node[i].weight = divisor;
155 				increased = TRUE;
156 			}
157 		}
158 	} while(increased);
159 
160 	/* Scaling factor choosen, now scale */
161 	if(divisor > 1)
162 		for(i = 0; i < NUMVALS; ++i)
163 			node[i].weight /= divisor;
164 }
165 /*  */
166 /* heap() and adjust() maintain a list of binary trees as a
167  * heap with the top indexing the binary tree on the list
168  * which has the least weight or, in case of equal weights,
169  * least depth in its longest path. The depth part is not
170  * strictly necessary, but tends to avoid long codes which
171  * might provoke rescaling.
172  */
173 
heap(list,length)174 heap(list, length)
175 int list[], length;
176 {
177 	int i;
178 
179 	for(i = (length - 2) / 2; i >= 0; --i)
180 		adjust(list, i, length - 1);
181 }
182 
183 /* Make a heap from a heap with a new top */
184 
adjust(list,top,bottom)185 adjust(list, top, bottom)
186 int list[], top, bottom;
187 {
188 	int k, temp;
189 	char cmptrees();
190 
191 	k = 2 * top + 1;	/* left child of top */
192 	temp = list[top];	/* remember root node of top tree */
193 	if( k <= bottom) {
194 		if( k < bottom && cmptrees(list[k], list[k + 1]))
195 			++k;
196 
197 		/* k indexes "smaller" child (in heap of trees) of top */
198 		/* now make top index "smaller" of old top and smallest child */
199 		if(cmptrees(temp, list[k])) {
200 			list[top] = list[k];
201 			list[k] = temp;
202 			/* Make the changed list a heap */
203 			adjust(list, k, bottom); /*recursive*/
204 		}
205 	}
206 }
207 
208 /* Compare two trees, if a > b return true, else return false
209  * note comparison rules in previous comments.
210  */
211 
212 char	/* Boolean */
cmptrees(a,b)213 cmptrees(a, b)
214 int a, b;	/* root nodes of trees */
215 {
216 	if(node[a].weight > node[b].weight)
217 		return (TRUE);
218 	if(node[a].weight == node[b].weight)
219 		if(node[a].tdepth > node[b].tdepth)
220 			return (TRUE);
221 	return (FALSE);
222 }
223 /*  */
224 /* HUFFMAN ALGORITHM: develops the single element trees
225  * into a single binary tree by forming subtrees rooted in
226  * interior nodes having weights equal to the sum of weights of all
227  * their descendents and having depth counts indicating the
228  * depth of their longest paths.
229  *
230  * When all trees have been formed into a single tree satisfying
231  * the heap property (on weight, with depth as a tie breaker)
232  * then the binary code assigned to a leaf (value to be encoded)
233  * is then the series of left (0) and right (1)
234  * paths leading from the root to the leaf.
235  * Note that trees are removed from the heaped list by
236  * moving the last element over the top element and
237  * reheaping the shorter list.
238  */
239 
bld_tree(list,len)240 bld_tree(list, len)
241 int list[];
242 int len;
243 {
244 	int freenode;		/* next free node in tree */
245 	int lch, rch;		/* temporaries for left, right children */
246 	struct nd *frnp;	/* free node pointer */
247 	int i;
248 	char maxchar();
249 
250 	/* Initialize index to next available (non-leaf) node.
251 	 * Lower numbered nodes correspond to leaves (data values).
252 	 */
253 	freenode = NUMVALS;
254 
255 	while(len > 1) {
256 		/* Take from list two btrees with least weight
257 		 * and build an interior node pointing to them.
258 		 * This forms a new tree.
259 		 */
260 		lch = list[0];	/* This one will be left child */
261 
262 		/* delete top (least) tree from the list of trees */
263 		list[0] = list[--len];
264 		adjust(list, 0, len - 1);
265 
266 		/* Take new top (least) tree. Reuse list slot later */
267 		rch = list[0];	/* This one will be right child */
268 
269 		/* Form new tree from the two least trees using
270 		 * a free node as root. Put the new tree in the list.
271 		 */
272 		frnp = &node[freenode];	/* address of next free node */
273 		list[0] = freenode++;	/* put at top for now */
274 		frnp->lchild = lch;
275 		frnp->rchild = rch;
276 		frnp->weight = node[lch].weight + node[rch].weight;
277 		frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);
278 		/* reheap list  to get least tree at top*/
279 		adjust(list, 0, len - 1);
280 	}
281 	dctreehd = list[0];	/*head of final tree */
282 }
283 /*  */
284 char
maxchar(a,b)285 maxchar(a, b)
286 char a, b;
287 {
288 	return (a > b ? a : b);
289 }
290 /* Initialize all nodes to single element binary trees
291  * with zero weight and depth.
292  */
293 
init_tree()294 init_tree()
295 {
296 	int i;
297 
298 	for(i = 0; i < NUMNODES; ++i) {
299 		node[i].weight = 0;
300 		node[i].tdepth = 0;
301 		node[i].lchild = NOCHILD;
302 		node[i].rchild = NOCHILD;
303 	}
304 }
305 
init_enc()306 init_enc()
307 {
308 	int i;
309 
310 	/* Initialize encoding table */
311 	for(i = 0; i < NUMVALS; ++i) {
312 		codelen[i] = 0;
313 	}
314 }
315 /*  */
316 /* Recursive routine to walk the indicated subtree and level
317  * and maintain the current path code in bstree. When a leaf
318  * is found the entire code string and length are put into
319  * the encoding table entry for the leaf's data value.
320  *
321  * Returns ERROR if codes are too long.
322  */
323 
324 int		/* returns ERROR or NULL */
buildenc(level,root)325 buildenc(level, root)
326 int level;/* level of tree being examined, from zero */
327 int root; /* root of subtree is also data value if leaf */
328 {
329 	int l, r;
330 
331 	l = node[root].lchild;
332 	r = node[root].rchild;
333 #ifdef DEBUG
334 	if (debug) printf("level=%d,root=%d,l=%d,r=%d,tcode=%04x\n",level,root,l,r,tcode);
335 #endif
336 	if( l == NOCHILD && r == NOCHILD) {
337 		/* Leaf. Previous path determines bit string
338 		 * code of length level (bits 0 to level - 1).
339 		 * Ensures unused code bits are zero.
340 		 */
341 		codelen[root] = level;
342 #ifdef TOPS20
343 		code[root] = tcode & (65535 >> (16 - level));
344 #else
345 		code[root] = tcode & ((unsigned int)(~0) >> ((sizeof(int) * 8) - level));
346 #endif
347 #ifdef DEBUG
348 		if (debug) printf("  codelen[%d]=%d,code[%d]=%02x\n",root,codelen[root],root,code[root]);
349 #endif
350 		return ((level > 16) ? ERROR : NULL);
351 	} else {
352 		if( l != NOCHILD) {
353 			/* Clear path bit and continue deeper */
354 			tcode &= ~(1 << level);
355 			/* NOTE RECURSION */
356 			if(buildenc(level + 1, l) == ERROR)
357 				return (ERROR);
358 		}
359 		if(r != NOCHILD) {
360 			/* Set path bit and continue deeper */
361 			tcode |= 1 << level;
362 			/* NOTE RECURSION */
363 			if(buildenc(level + 1, r) == ERROR)
364 				return (ERROR);
365 		}
366 	}
367 	return (NULL);	/* if we got here we're ok so far */
368 }
369 /*  */
370 /* Write out the header of the compressed file */
371 
372 #ifdef TOPS20
wrt_head(ob,req)373 wrt_head(ob, req)
374 FILE *ob;
375 struct filename *req;
376 #else
377 wrt_head(ob, infile)
378 FILE *ob;
379 char *infile;	/* input file name (w/ or w/o drive) */
380 #endif
381 {
382 #ifdef TOPS20
383 	char *q;
384 	char fnbuf[100], lfname[40], lftype[40];
385 #endif
386 	int i, k, l, r;
387 	int numnodes;		/* nbr of nodes in simplified tree */
388 
389 	putwe(RECOGNIZE, ob);	/* identifies as compressed */
390 	putwe(crc, ob);		/* unsigned sum of original data */
391 #ifdef TOPS20
392 	strcpy (lfname, req->nm);
393 	strcpy (lftype, req->typ);
394 	lower (lfname);
395 	lower (lftype);
396 	sprintf (fnbuf, "%.8s%s%.3s", lfname,
397 		 (req->typ && *req->typ) ? "." : "", lftype);
398 	q = fnbuf;
399 	do {
400 		putce(*q, ob);
401 	} while(*(q++) != '\0');
402 #else
403 	/* Record the original file name w/o drive */
404 	if(*(infile + 1) == ':')
405 		infile += 2;	/* skip drive */
406 
407 	do {
408 		putce(*infile, ob);
409 	} while(*(infile++) != '\0');
410 #endif
411 
412 	/* Write out a simplified decoding tree. Only the interior
413 	 * nodes are written. When a child is a leaf index
414 	 * (representing a data value) it is recoded as
415 	 * -(index + 1) to distinguish it from interior indexes
416 	 * which are recoded as positive indexes in the new tree.
417 	 * Note that this tree will be empty for an empty file.
418 	 */
419 
420 	numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS -1);
421 	putwe(numnodes, ob);
422 
423 	for(k = 0, i = dctreehd; k < numnodes; ++k, --i) {
424 		l = node[i].lchild;
425 		r = node[i].rchild;
426 		l = l < NUMVALS ? -(l + 1) : dctreehd - l;
427 		r = r < NUMVALS ? -(r + 1) : dctreehd - r;
428 		putwe(l, ob);	/* left child */
429 		putwe(r, ob);	/* right child */
430 	}
431 }
432 /*  */
433 /* Get an encoded byte or EOF. Reads from specified stream AS NEEDED.
434  *
435  * There are two unsynchronized bit-byte relationships here.
436  * The input stream bytes are converted to bit strings of
437  * various lengths via the static variables named c...
438  * These bit strings are concatenated without padding to
439  * become the stream of encoded result bytes, which this
440  * function returns one at a time. The EOF (end of file) is
441  * converted to SPEOF for convenience and encoded like any
442  * other input value. True EOF is returned after that.
443  *
444  * The original gethuff() called a seperate function,
445  * getbit(), but that more readable version was too slow.
446  */
447 
448 int				/*  Returns byte values except for EOF */
gethuff(ib)449 gethuff(ib)
450 FILE *ib;
451 {
452 	unsigned int rbyte;	/* Result byte value */
453 	char need, take;	/* numbers of bits */
454 
455 	rbyte = 0;
456 	need = 8;		/* build one byte per call */
457 
458 	/* Loop to build a byte of encoded data
459 	 * Initialization forces read the first time
460 	 */
461 
462 loop:
463 	if(cbitsrem >= need) {
464 		/* Current code fullfills our needs */
465 		if(need == 0)
466 			return (rbyte & 0xff);
467 		/* Take what we need */
468  		rbyte |= ccode << (8 - need);
469 		/* And leave the rest */
470 		ccode >>= need;
471 		cbitsrem -= need;
472 		return (rbyte & 0xff);
473 	}
474 
475 	/* We need more than current code */
476 	if(cbitsrem > 0) {
477 		/* Take what there is */
478 		rbyte |= ccode << (8 - need);
479 		need -= cbitsrem;
480 	}
481 	/* No more bits in current code string */
482 	if(curin == SPEOF) {
483 		/* The end of file token has been encoded. If
484 		 * result byte has data return it and do EOF next time
485 		 */
486 		cbitsrem = 0;
487 
488 		return ((need == 8) ? EOF : rbyte & 0xff);
489 	}
490 
491 	/* Get an input byte */
492 	if((curin = getcnr(ib)) == EOF)
493 		curin = SPEOF;	/* convenient for encoding */
494 
495 	/* Get the new byte's code */
496 	ccode = code[curin];
497 	cbitsrem = codelen[curin];
498 
499 	goto loop;
500 }
501