1 /*
2  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, May 2003.
3  *
4  * Derived from unzip 5.40.
5  *
6  * Sccsid @(#)unshrink.c	1.4 (gritter) 6/18/04
7  */
8 /*---------------------------------------------------------------------------
9 
10   unshrink.c                     version 1.21                     23 Nov 95
11 
12 
13        NOTE:  This code may or may not infringe on the so-called "Welch
14        patent" owned by Unisys.  (From reading the patent, it appears
15        that a pure LZW decompressor is *not* covered, but this claim has
16        not been tested in court, and Unisys is reported to believe other-
17        wise.)  It is therefore the responsibility of the user to acquire
18        whatever license(s) may be required for legal use of this code.
19 
20        THE INFO-ZIP GROUP DISCLAIMS ALL LIABILITY FOR USE OF THIS CODE
21        IN VIOLATION OF APPLICABLE PATENT LAW.
22 
23 
24   Shrinking is basically a dynamic LZW algorithm with allowed code sizes of
25   up to 13 bits; in addition, there is provision for partial clearing of
26   leaf nodes.  PKWARE uses the special code 256 (decimal) to indicate a
27   change in code size or a partial clear of the code tree:  256,1 for the
28   former and 256,2 for the latter.  [Note that partial clearing can "orphan"
29   nodes:  the parent-to-be can be cleared before its new child is added,
30   but the child is added anyway (as an orphan, as though the parent still
31   existed).  When the tree fills up to the point where the parent node is
32   reused, the orphan is effectively "adopted."  Versions prior to 1.05 were
33   affected more due to greater use of pointers (to children and siblings
34   as well as parents).]
35 
36   This replacement version of unshrink.c was written from scratch.  It is
37   based only on the algorithms described in Mark Nelson's _The Data Compres-
38   sion Book_ and in Terry Welch's original paper in the June 1984 issue of
39   IEEE _Computer_; no existing source code, including any in Nelson's book,
40   was used.
41 
42   Memory requirements have been reduced in this version and are now no more
43   than the original Sam Smith code.  This is still larger than any of the
44   other algorithms:  at a minimum, 8K+8K+16K (stack+values+parents) assuming
45   16-bit short ints, and this does not even include the output buffer (the
46   other algorithms leave the uncompressed data in the work area, typically
47   called slide[]).  For machines with a 64KB data space this is a problem,
48   particularly when text conversion is required and line endings have more
49   than one character.  UnZip's solution is to use two roughly equal halves
50   of outbuf for the ASCII conversion in such a case; the "unshrink" argument
51   to flush() signals that this is the case.
52 
53   For large-memory machines, a second outbuf is allocated for translations,
54   but only if unshrinking and only if translations are required.
55 
56               | binary mode  |        text mode
57     ---------------------------------------------------
58     big mem   |  big outbuf  | big outbuf + big outbuf2  <- malloc'd here
59     small mem | small outbuf | half + half small outbuf
60 
61   Copyright 1994, 1995 Greg Roelofs.  See the accompanying file "COPYING"
62   in UnZip 5.20 (or later) source or binary distributions.
63 
64   From "COPYING":
65 
66    The following copyright applies to the new version of unshrink.c,
67    distributed with UnZip version 5.2 and later:
68 
69      * Copyright (c) 1994 Greg Roelofs.
70      * Permission is granted to any individual/institution/corporate
71      * entity to use, copy, redistribute or modify this software for
72      * any purpose whatsoever, subject to the conditions noted in the
73      * Frequently Asked Questions section below, plus one additional
74      * condition:  namely, that my name not be removed from the source
75      * code.  (Other names may, of course, be added as modifications
76      * are made.)  Corporate legal staff (like at IBM :-) ) who have
77      * problems understanding this can contact me through Zip-Bugs...
78 
79 
80    Q. Can I use the source code of Zip and UnZip in my commercial
81       application?
82 
83    A. Yes, so long as you include in your product an acknowledgment; a
84       pointer to the original, free compression sources; and a statement
85       making it clear that there are no extra or hidden charges resulting
86       from the use of our compression code in your product (see below for
87       an example).  The acknowledgment should appear in at least one piece
88       of human-readable documentation (e.g., a README file or man page),
89       although additionally putting it in the executable(s) is OK, too.
90       In other words, you are allowed to sell only your own work, not ours,
91       and we'd like a little credit.  (Note the additional restrictions
92       above on the code in unreduce.c, unshrink.c, vms.c, time_lib.c, and
93       everything in the wince and windll subdirectories.)  Contact us at
94       Zip-Bugs@lists.wku.edu if you have special requirements.  We also
95       like to hear when our code is being used, but we don't require that.
96 
97          <Product> incorporates compression code from the Info-ZIP group.
98          There are no extra charges or costs due to the use of this code,
99          and the original compression sources are freely available from
100          http://www.cdrom.com/pub/infozip/ or ftp://ftp.cdrom.com/pub/infozip/
101          on the Internet.
102 
103       If you only need compression capability, not full zipfile support,
104       you might want to look at zlib instead; it has fewer restrictions
105       on commercial use.  See http://www.cdrom.com/pub/infozip/zlib/ .
106 
107   ---------------------------------------------------------------------------*/
108 
109 #include <string.h>
110 #include <stdio.h>
111 
112 #include "cpio.h"
113 #include "unzip.h"
114 
115 static void	partial_clear(struct globals *);
116 
117 #define	trace()
118 
119 /* HSIZE is defined as 2^13 (8192) in unzip.h */
120 #define	BOGUSCODE  256
121 #define	FLAG_BITS  parent        /* upper bits of parent[] used as flag bits */
122 #define	CODE_MASK  (HSIZE - 1)   /* 0x1fff (lower bits are parent's index) */
123 #define	FREE_CODE  HSIZE         /* 0x2000 (code is unused or was cleared) */
124 #define	HAS_CHILD  (HSIZE << 1)  /* 0x4000 (code has a child--do not clear) */
125 
126 #define parent G.area.shrink.Parent
127 #define Value  G.area.shrink.value /* "value" conflicts with Pyramid ioctl.h */
128 #define stack  G.area.shrink.Stack
129 
130 /***********************/
131 /* Function unshrink() */
132 /***********************/
133 
134 int
zipunshrink(struct file * f,const char * tgt,int tfd,int doswap,uint32_t * crc)135 zipunshrink(struct file *f, const char *tgt, int tfd, int doswap, uint32_t *crc)
136 {
137     struct globals G;
138     int offset = (HSIZE - 1);
139     uint8_t *stacktop = stack + offset;
140     register uint8_t *newstr;
141     int codesize=9, len, KwKwK;
142     int16_t code, oldcode, freecode, curcode;
143     int16_t lastfreecode;
144     unsigned int outbufsiz;
145 
146 /*---------------------------------------------------------------------------
147     Initialize various variables.
148   ---------------------------------------------------------------------------*/
149 
150     memset(&G, 0, sizeof G);
151     G.tgt = tgt;
152     G.tfd = tfd;
153     G.doswap = doswap;
154     G.crc = crc;
155     G.zsize = G.uzsize = f->f_csize;
156     lastfreecode = BOGUSCODE;
157 
158     for (code = 0;  code < BOGUSCODE;  ++code) {
159         Value[code] = (uint8_t)code;
160         parent[code] = BOGUSCODE;
161     }
162     for (code = BOGUSCODE+1;  code < HSIZE;  ++code)
163         parent[code] = FREE_CODE;
164 
165     outbufsiz = OUTBUFSIZ;
166     G.outptr = G.outbuf;
167     G.outcnt = 0L;
168 
169 /*---------------------------------------------------------------------------
170     Get and output first code, then loop over remaining ones.
171   ---------------------------------------------------------------------------*/
172 
173     READBITS(codesize, oldcode)
174     if (!G.zipeof) {
175         *G.outptr++ = (uint8_t)oldcode;
176         ++G.outcnt;
177     }
178 
179     do {
180         READBITS(codesize, code)
181         if (G.zipeof)
182             break;
183         if (code == BOGUSCODE) {   /* possible to have consecutive escapes? */
184             READBITS(codesize, code)
185             if (code == 1) {
186                 ++codesize;
187                 Trace((stderr, " (codesize now %d bits)\n", codesize));
188             } else if (code == 2) {
189                 Trace((stderr, " (partial clear code)\n"));
190                 partial_clear(&G);   /* clear leafs (nodes with no children) */
191                 Trace((stderr, " (done with partial clear)\n"));
192                 lastfreecode = BOGUSCODE;  /* reset start of free-node search */
193             }
194             continue;
195         }
196 
197     /*-----------------------------------------------------------------------
198         Translate code:  traverse tree from leaf back to root.
199       -----------------------------------------------------------------------*/
200 
201         newstr = stacktop;
202         curcode = code;
203 
204         if (parent[curcode] == FREE_CODE) {
205             /* or (FLAG_BITS[curcode] & FREE_CODE)? */
206             KwKwK = TRUE;
207             Trace((stderr, " (found a KwKwK code %d; oldcode = %d)\n", code,
208               oldcode));
209             --newstr;   /* last character will be same as first character */
210             curcode = oldcode;
211         } else
212             KwKwK = FALSE;
213 
214         do {
215             *newstr-- = Value[curcode];
216             curcode = (int16_t)(parent[curcode] & CODE_MASK);
217         } while (curcode != BOGUSCODE);
218 
219         len = (int)(stacktop - newstr++);
220         if (KwKwK)
221             *stacktop = *newstr;
222 
223     /*-----------------------------------------------------------------------
224         Write expanded string in reverse order to output buffer.
225       -----------------------------------------------------------------------*/
226 
227         Trace((stderr, "code %4d; oldcode %4d; char %3d (%c); string [", code,
228           oldcode, (int)(*newstr), (*newstr<32 || *newstr>=127)? ' ':*newstr));
229 
230         {
231             register uint8_t *p;
232 
233             for (p = newstr;  p < newstr+len;  ++p) {
234                 *G.outptr++ = *p;
235                 if (++G.outcnt == outbufsiz) {
236 		    flush(&G, G.outbuf, G.outcnt);
237                     G.outptr = G.outbuf;
238                     G.outcnt = 0L;
239                 }
240             }
241         }
242 
243     /*-----------------------------------------------------------------------
244         Add new leaf (first character of newstr) to tree as child of oldcode.
245       -----------------------------------------------------------------------*/
246 
247         /* search for freecode */
248         freecode = (int16_t)(lastfreecode + 1);
249         /* add if-test before loop for speed? */
250         while (parent[freecode] != FREE_CODE)
251             ++freecode;
252         lastfreecode = freecode;
253         Trace((stderr, "]; newcode %d\n", freecode));
254 
255         Value[freecode] = *newstr;
256         parent[freecode] = oldcode;
257         oldcode = code;
258 
259     } while (!G.zipeof);
260 
261 /*---------------------------------------------------------------------------
262     Flush any remaining data and return to sender...
263   ---------------------------------------------------------------------------*/
264 
265     if (G.outcnt > 0L)
266 	    flush(&G, G.outbuf, G.outcnt);
267 
268     return G.status;
269 
270 } /* end function unshrink() */
271 
272 
273 
274 
275 
276 /****************************/
277 /* Function partial_clear() */      /* no longer recursive... */
278 /****************************/
279 
280 static void
partial_clear(struct globals * Gp)281 partial_clear(struct globals *Gp)
282 {
283 #define	G	(*Gp)
284     register int16_t code;
285 
286     /* clear all nodes which have no children (i.e., leaf nodes only) */
287 
288     /* first loop:  mark each parent as such */
289     for (code = BOGUSCODE+1;  code < HSIZE;  ++code) {
290         register int16_t cparent = (int16_t)(parent[code] & CODE_MASK);
291 
292         if (cparent > BOGUSCODE && cparent != FREE_CODE)
293             FLAG_BITS[cparent] |= HAS_CHILD;   /* set parent's child-bit */
294     }
295 
296     /* second loop:  clear all nodes *not* marked as parents; reset flag bits */
297     for (code = BOGUSCODE+1;  code < HSIZE;  ++code) {
298         if (FLAG_BITS[code] & HAS_CHILD)    /* just clear child-bit */
299             FLAG_BITS[code] &= ~HAS_CHILD;
300         else {                              /* leaf:  lose it */
301             Trace((stderr, "%d\n", code));
302             parent[code] = FREE_CODE;
303         }
304     }
305 
306     return;
307 }
308