1 // ///////////////////////////////////////
2 // smallz4cat.c
3 // Copyright (c) 2016-2018 Stephan Brumme. All rights reserved.
4 // see https://create.stephan-brumme.com/smallz4/
5 //
6 // "MIT License":
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"),
9 // to deal in the Software without restriction, including without limitation
10 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 // and/or sell copies of the Software, and to permit persons to whom the Software
12 // is furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included
15 // in all copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
18 // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
19 // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 
24 // This program is a shorter, more readable, albeit slower re-implementation of lz4cat ( https://github.com/Cyan4973/xxHash )
25 
26 // compile: gcc smallz4cat.c -O3 -o smallz4cat -Wall -pedantic -std=c99 -s
27 // The static 8k binary was compiled using Clang and dietlibc (see https://www.fefe.de/dietlibc/ )
28 
29 // Limitations:
30 // - skippable frames and legacy frames are not implemented (and most likely never will)
31 // - checksums are not verified (see https://create.stephan-brumme.com/xxhash/ for a simple implementation)
32 
33 // Replace getByteFromIn() and sendToOut() by your own code if you need in-memory LZ4 decompression.
34 // Corrupted data causes a call to error().
35 
36 #define READ_BUFFER_SIZE 4*1024
37 
38 #include <stdint.h> // uint32_t
39 #include <stdio.h>  // stdin/stdout/stderr, fopen, ...
40 #include <stdlib.h> // exit()
41 #include <string.h> // memcpy
42 
43 #include "lz4dec.h"
44 
45 // error handler
error(const char * msg)46 void error(const char* msg) {
47 	// smaller static binary than fprintf(stderr, "ERROR: %s\n", msg);
48 	fputs("ERROR: ", stderr);
49 	fputs(msg,       stderr);
50 	fputs("\n",      stderr);
51 	exit(  1);
52 }
53 
getByte(FILE * in)54 static unsigned char getByte( FILE *in ) {
55 	// modify buffer size as you like ... for most use cases, bigger buffer aren't faster anymore - and even reducing to 1 byte works !
56 	static unsigned char readBuffer[READ_BUFFER_SIZE];
57 	static size_t        pos       = 0;
58 	static size_t        available = 0;
59 
60 	// refill buffer
61 	if (pos == available)
62 	{
63 		pos = 0;
64 		available = fread(readBuffer, 1, READ_BUFFER_SIZE, in);
65 		if (available == 0)
66 			error("out of data");
67 	}
68 
69 	// return a byte
70 	return readBuffer[pos++];
71 }
72 
sendBytes(const unsigned char * data,unsigned int numBytes,FILE * out)73 static void sendBytes(const unsigned char* data, unsigned int numBytes, FILE *out) {
74 	if (data != NULL && numBytes > 0)
75 		fwrite(data, 1, numBytes, out);
76 }
77 
unlz4(const char * ifn,const char * ofn,const char * dictionary)78 void unlz4( const char *ifn, const char *ofn, const char* dictionary) {
79 
80 	FILE *inpFp = fopen( ifn, "rb" );
81 	FILE *outFp = fopen( ofn, "wb" );
82 
83 	// signature
84 	unsigned char signature1 = getByte( inpFp );
85 	unsigned char signature2 = getByte( inpFp );
86 	unsigned char signature3 = getByte( inpFp );
87 	unsigned char signature4 = getByte( inpFp );
88 	uint32_t signature = (signature4 << 24) | (signature3 << 16) | (signature2 << 8) | signature1;
89 	int isModern = (signature == 0x184D2204);
90 	int isLegacy = (signature == 0x184C2102);
91 	if (!isModern && !isLegacy)
92 		error("invalid signature");
93 
94 	unsigned char hasBlockChecksum   = 0;
95 	unsigned char hasContentSize     = 0;
96 	unsigned char hasContentChecksum = 0;
97 	unsigned char hasDictionaryID    = 0;
98 	if (isModern) {
99 		// flags
100 		unsigned char flags = getByte( inpFp );
101 		hasBlockChecksum   = flags & 16;
102 		hasContentSize     = flags &  8;
103 		hasContentChecksum = flags &  4;
104 		hasDictionaryID    = flags &  1;
105 
106 		// only version 1 file format
107 		unsigned char version = flags >> 6;
108 		if (version != 1)
109 			error("only LZ4 file format version 1 supported");
110 
111 		// ignore blocksize
112 		getByte( inpFp );
113 
114 		if (hasContentSize)
115 		{
116 			// ignore, skip 8 bytes
117 			getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
118 			getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
119 		}
120 		if (hasDictionaryID)
121 		{
122 			// ignore, skip 4 bytes
123 			getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
124 		}
125 
126 		// ignore header checksum (xxhash32 of everything up this point & 0xFF)
127 		getByte( inpFp );
128 	}
129 
130 	// don't lower this value, backreferences can be 64kb far away
131 #define HISTORY_SIZE 64*1024
132 	// contains the latest decoded data
133 	unsigned char history[HISTORY_SIZE];
134 	// next free position in history[]
135 	unsigned int  pos = 0;
136 
137 	// dictionary compression is a recently introduced feature, just move its contents to the buffer
138 	if (dictionary != NULL)
139 	{
140 		// open dictionary
141 		FILE* dict = fopen(dictionary, "rb");
142 		if (!dict)
143 			error("cannot open dictionary");
144 
145 		// get dictionary's filesize
146 		fseek(dict, 0, SEEK_END);
147 		size_t dictSize = ftell(dict);
148 		// only the last 64k are relevant
149 		size_t relevant = dictSize < 65536 ? 0 : dictSize - 65536;
150 		fseek(dict, (long)relevant, SEEK_SET);
151 		if (dictSize > 65536)
152 			dictSize = 65536;
153 		// read it and store it at the end of the buffer
154 		fread(history + HISTORY_SIZE - dictSize, 1, dictSize, dict);
155 		fclose(dict);
156 	}
157 
158 	// parse all blocks until blockSize == 0
159 	while (1)
160 	{
161 		// block size
162 		uint32_t blockSize = getByte( inpFp );
163 		blockSize |= (uint32_t)getByte( inpFp ) <<  8;
164 		blockSize |= (uint32_t)getByte( inpFp ) << 16;
165 		blockSize |= (uint32_t)getByte( inpFp ) << 24;
166 
167 		// highest bit set ?
168 		unsigned char isCompressed = isLegacy || (blockSize & 0x80000000) == 0;
169 		if (isModern)
170 			blockSize &= 0x7FFFFFFF;
171 
172 		// stop after last block
173 		if (blockSize == 0)
174 			break;
175 
176 		if (isCompressed)
177 		{
178 			// decompress block
179 			uint32_t blockOffset = 0;
180 			while (blockOffset < blockSize)
181 			{
182 				// get a token
183 				unsigned char token = getByte( inpFp );
184 
185 				blockOffset++;
186 
187 				// determine number of literals
188 				uint32_t numLiterals = (token >> 4) & 0x0F;
189 				if (numLiterals == 15)
190 				{
191 					// number of literals length encoded in more than 1 byte
192 					unsigned char current;
193 					do
194 					{
195 						current = getByte( inpFp );
196 						numLiterals += current;
197 						blockOffset++;
198 					} while (current == 255);
199 				}
200 
201 				blockOffset += numLiterals;
202 				// copy all those literals
203 				while (numLiterals-- > 0)
204 				{
205 					history[pos++] = getByte( inpFp );
206 
207 					// flush output buffer
208 					if (pos == HISTORY_SIZE)
209 					{
210 						sendBytes(history, HISTORY_SIZE, outFp);
211 						pos = 0;
212 					}
213 				}
214 
215 				// last token has only literals
216 				if (blockOffset == blockSize)
217 					break;
218 
219 				// match distance is encoded by two bytes (little endian)
220 				blockOffset += 2;
221 				uint32_t delta = getByte( inpFp );
222 				delta |= (uint32_t)getByte( inpFp ) << 8;
223 				// zero isn't allowed
224 				if (delta == 0)
225 					error("invalid offset");
226 
227 				// match length (must be >= 4, therefore length is stored minus 4)
228 				uint32_t matchLength = 4 + (token & 0x0F);
229 				if (matchLength == 4 + 0x0F)
230 				{
231 					unsigned char current;
232 					do // match length encoded in more than 1 byte
233 					{
234 						current = getByte( inpFp );
235 						matchLength += current;
236 						blockOffset++;
237 					} while (current == 255);
238 				}
239 
240 				// copy match
241 				uint32_t reference = (pos >= delta) ? pos - delta : HISTORY_SIZE + pos - delta;
242 				if (pos + matchLength < HISTORY_SIZE && reference + matchLength < HISTORY_SIZE)
243 				{
244 					// fast copy
245 					if (pos >= reference + matchLength || reference >= pos + matchLength)
246 					{
247 						// non-overlapping
248 						memcpy(history + pos, history + reference, matchLength);
249 						pos += matchLength;
250 					}
251 					else
252 					{
253 						// overlapping
254 						while (matchLength-- > 0)
255 							history[pos++] = history[reference++];
256 					}
257 				}
258 				else
259 				{
260 					// slower copy, have to take care of buffer limits
261 					while (matchLength-- > 0)
262 					{
263 						// copy single byte
264 						history[pos++] = history[reference++];
265 
266 						// cannot write anymore ? => wrap around
267 						if (pos == HISTORY_SIZE)
268 						{
269 							// flush output buffer
270 							sendBytes(history, HISTORY_SIZE, outFp);
271 							pos = 0;
272 						}
273 						// cannot read anymore ? => wrap around
274 						if (reference == HISTORY_SIZE)
275 							reference = 0;
276 					}
277 				}
278 			}
279 
280 			// all legacy blocks must be completely filled - except for the last one
281 			if (isLegacy && blockSize < 8*1024*1024)
282 				break;
283 		}
284 		else
285 		{
286 			// copy uncompressed data and add to history, too (if next block is compressed and some matches refer to this block)
287 			while (blockSize-- > 0)
288 			{
289 				// copy a byte ...
290 				history[pos++] = getByte( inpFp );
291 				// ... until buffer is full => send to output
292 				if (pos == HISTORY_SIZE)
293 				{
294 					sendBytes(history, HISTORY_SIZE, outFp);
295 					pos = 0;
296 				}
297 			}
298 		}
299 
300 		if (hasBlockChecksum)
301 		{
302 			// ignore checksum, skip 4 bytes
303 			getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
304 		}
305 	}
306 
307 	if (hasContentChecksum)
308 	{
309 		// ignore checksum, skip 4 bytes
310 		getByte( inpFp ); getByte( inpFp ); getByte( inpFp ); getByte( inpFp );
311 	}
312 
313 	// flush output buffer
314 	sendBytes(history, pos, outFp);
315 }
316