1 // ///////////////////////////////////////
2 // smallz4cat.c
3 // Copyright (c) 2016-2018 Stephan Brumme. All rights reserved.
4 // see https://create.stephan-brumme.com/smallz4/
5 //
6 // "MIT License":
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the Software
12 // is furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included
15 // in all copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
18 // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
19 // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 // This program is a shorter, more readable, albeit slower re-implementation of lz4cat ( https://github.com/Cyan4973/xxHash )
25
26 // compile: gcc smallz4cat.c -O3 -o smallz4cat -Wall -pedantic -std=c99 -s
27 // The static 8k binary was compiled using Clang and dietlibc (see https://www.fefe.de/dietlibc/ )
28
29 // Limitations:
30 // - skippable frames and legacy frames are not implemented (and most likely never will)
31 // - checksums are not verified (see https://create.stephan-brumme.com/xxhash/ for a simple implementation)
32
33 // Replace getByteFromIn() and sendToOut() by your own code if you need in-memory LZ4 decompression.
34 // Corrupted data causes a call to error().
35
36 #define READ_BUFFER_SIZE 4*1024
37
38 #include <stdint.h> // uint32_t
39 #include <stdio.h> // stdin/stdout/stderr, fopen, ...
40 #include <stdlib.h> // exit()
41 #include <string.h> // memcpy
42
43 #include "lz4dec.h"
44
45 // error handler
error(const char * msg)46 void error(const char* msg) {
47 // smaller static binary than fprintf(stderr, "ERROR: %s\n", msg);
48 fputs("ERROR: ", stderr);
49 fputs(msg, stderr);
50 fputs("\n", stderr);
51 exit( 1);
52 }
53
getByte(FILE * in)54 static unsigned char getByte( FILE *in ) {
55 // modify buffer size as you like ... for most use cases, bigger buffer aren't faster anymore - and even reducing to 1 byte works !
56 static unsigned char readBuffer[READ_BUFFER_SIZE];
57 static size_t pos = 0;
58 static size_t available = 0;
59
60 // refill buffer
61 if (pos == available)
62 {
63 pos = 0;
64 available = fread(readBuffer, 1, READ_BUFFER_SIZE, in);
65 if (available == 0)
66 error("out of data");
67 }
68
69 // return a byte
70 return readBuffer[pos++];
71 }
72
sendBytes(const unsigned char * data,unsigned int numBytes,FILE * out)73 static void sendBytes(const unsigned char* data, unsigned int numBytes, FILE *out) {
74 if (data != NULL && numBytes > 0)
75 fwrite(data, 1, numBytes, out);
76 }
77
unlz4(const char * ifn,const char * ofn,const char * dictionary)78 void unlz4( const char *ifn, const char *ofn, const char* dictionary) {
79
80 FILE *inpFp = fopen( ifn, "rb" );
81 FILE *outFp = fopen( ofn, "wb" );
82
83 // signature
84 unsigned char signature1 = getByte( inpFp );
85 unsigned char signature2 = getByte( inpFp );
86 unsigned char signature3 = getByte( inpFp );
87 unsigned char signature4 = getByte( inpFp );
88 uint32_t signature = (signature4 << 24) | (signature3 << 16) | (signature2 << 8) | signature1;
89 int isModern = (signature == 0x184D2204);
90 int isLegacy = (signature == 0x184C2102);
91 if (!isModern && !isLegacy)
92 error("invalid signature");
93
94 unsigned char hasBlockChecksum = 0;
95 unsigned char hasContentSize = 0;
96 unsigned char hasContentChecksum = 0;
97 unsigned char hasDictionaryID = 0;
98 if (isModern) {
99 // flags
100 unsigned char flags = getByte( inpFp );
101 hasBlockChecksum = flags & 16;
102 hasContentSize = flags & 8;
103 hasContentChecksum = flags & 4;
104 hasDictionaryID = flags & 1;
105
106 // only version 1 file format
107 unsigned char version = flags >> 6;
108 if (version != 1)
109 error("only LZ4 file format version 1 supported");
110
111 // ignore blocksize
112 getByte( inpFp );
113
114 if (hasContentSize)
115 {
116 // ignore, skip 8 bytes
117 getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
118 getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
119 }
120 if (hasDictionaryID)
121 {
122 // ignore, skip 4 bytes
123 getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
124 }
125
126 // ignore header checksum (xxhash32 of everything up this point & 0xFF)
127 getByte( inpFp );
128 }
129
130 // don't lower this value, backreferences can be 64kb far away
131 #define HISTORY_SIZE 64*1024
132 // contains the latest decoded data
133 unsigned char history[HISTORY_SIZE];
134 // next free position in history[]
135 unsigned int pos = 0;
136
137 // dictionary compression is a recently introduced feature, just move its contents to the buffer
138 if (dictionary != NULL)
139 {
140 // open dictionary
141 FILE* dict = fopen(dictionary, "rb");
142 if (!dict)
143 error("cannot open dictionary");
144
145 // get dictionary's filesize
146 fseek(dict, 0, SEEK_END);
147 size_t dictSize = ftell(dict);
148 // only the last 64k are relevant
149 size_t relevant = dictSize < 65536 ? 0 : dictSize - 65536;
150 fseek(dict, (long)relevant, SEEK_SET);
151 if (dictSize > 65536)
152 dictSize = 65536;
153 // read it and store it at the end of the buffer
154 fread(history + HISTORY_SIZE - dictSize, 1, dictSize, dict);
155 fclose(dict);
156 }
157
158 // parse all blocks until blockSize == 0
159 while (1)
160 {
161 // block size
162 uint32_t blockSize = getByte( inpFp );
163 blockSize |= (uint32_t)getByte( inpFp ) << 8;
164 blockSize |= (uint32_t)getByte( inpFp ) << 16;
165 blockSize |= (uint32_t)getByte( inpFp ) << 24;
166
167 // highest bit set ?
168 unsigned char isCompressed = isLegacy || (blockSize & 0x80000000) == 0;
169 if (isModern)
170 blockSize &= 0x7FFFFFFF;
171
172 // stop after last block
173 if (blockSize == 0)
174 break;
175
176 if (isCompressed)
177 {
178 // decompress block
179 uint32_t blockOffset = 0;
180 while (blockOffset < blockSize)
181 {
182 // get a token
183 unsigned char token = getByte( inpFp );
184
185 blockOffset++;
186
187 // determine number of literals
188 uint32_t numLiterals = (token >> 4) & 0x0F;
189 if (numLiterals == 15)
190 {
191 // number of literals length encoded in more than 1 byte
192 unsigned char current;
193 do
194 {
195 current = getByte( inpFp );
196 numLiterals += current;
197 blockOffset++;
198 } while (current == 255);
199 }
200
201 blockOffset += numLiterals;
202 // copy all those literals
203 while (numLiterals-- > 0)
204 {
205 history[pos++] = getByte( inpFp );
206
207 // flush output buffer
208 if (pos == HISTORY_SIZE)
209 {
210 sendBytes(history, HISTORY_SIZE, outFp);
211 pos = 0;
212 }
213 }
214
215 // last token has only literals
216 if (blockOffset == blockSize)
217 break;
218
219 // match distance is encoded by two bytes (little endian)
220 blockOffset += 2;
221 uint32_t delta = getByte( inpFp );
222 delta |= (uint32_t)getByte( inpFp ) << 8;
223 // zero isn't allowed
224 if (delta == 0)
225 error("invalid offset");
226
227 // match length (must be >= 4, therefore length is stored minus 4)
228 uint32_t matchLength = 4 + (token & 0x0F);
229 if (matchLength == 4 + 0x0F)
230 {
231 unsigned char current;
232 do // match length encoded in more than 1 byte
233 {
234 current = getByte( inpFp );
235 matchLength += current;
236 blockOffset++;
237 } while (current == 255);
238 }
239
240 // copy match
241 uint32_t reference = (pos >= delta) ? pos - delta : HISTORY_SIZE + pos - delta;
242 if (pos + matchLength < HISTORY_SIZE && reference + matchLength < HISTORY_SIZE)
243 {
244 // fast copy
245 if (pos >= reference + matchLength || reference >= pos + matchLength)
246 {
247 // non-overlapping
248 memcpy(history + pos, history + reference, matchLength);
249 pos += matchLength;
250 }
251 else
252 {
253 // overlapping
254 while (matchLength-- > 0)
255 history[pos++] = history[reference++];
256 }
257 }
258 else
259 {
260 // slower copy, have to take care of buffer limits
261 while (matchLength-- > 0)
262 {
263 // copy single byte
264 history[pos++] = history[reference++];
265
266 // cannot write anymore ? => wrap around
267 if (pos == HISTORY_SIZE)
268 {
269 // flush output buffer
270 sendBytes(history, HISTORY_SIZE, outFp);
271 pos = 0;
272 }
273 // cannot read anymore ? => wrap around
274 if (reference == HISTORY_SIZE)
275 reference = 0;
276 }
277 }
278 }
279
280 // all legacy blocks must be completely filled - except for the last one
281 if (isLegacy && blockSize < 8*1024*1024)
282 break;
283 }
284 else
285 {
286 // copy uncompressed data and add to history, too (if next block is compressed and some matches refer to this block)
287 while (blockSize-- > 0)
288 {
289 // copy a byte ...
290 history[pos++] = getByte( inpFp );
291 // ... until buffer is full => send to output
292 if (pos == HISTORY_SIZE)
293 {
294 sendBytes(history, HISTORY_SIZE, outFp);
295 pos = 0;
296 }
297 }
298 }
299
300 if (hasBlockChecksum)
301 {
302 // ignore checksum, skip 4 bytes
303 getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
304 }
305 }
306
307 if (hasContentChecksum)
308 {
309 // ignore checksum, skip 4 bytes
310 getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
311 }
312
313 // flush output buffer
314 sendBytes(history, pos, outFp);
315 }
316