1 /*=============================================================================
2 runlength.c
3 ===============================================================================
4 "Packbits" run-length encoding and variants.
5
6 Copyright (C) 2015 by Akira Urushibata (afu@wta.att.ne.jp).
7
8 Permission to use, copy, modify, and distribute this software and its
9 documentation for any purpose and without fee is hereby granted, provided
10 that the above copyright notice appear in all copies and that both that
11 copyright notice and this permission notice appear in supporting
12 documentation. This software is provided "as is" without express or implied
13 warranty.
14
15 Functions pm_rlenc_compressbyte() and pm_rlenc_compressword() are based
16 on algorithm originally by Robert A. Knop (rknop@mop.caltech.edu).
17
18 Those who wish to incorporate the code herein into their own programs are
19 strongly discouraged from removing the comments within borders consisting
20 of "+" marks. This is a practical consideration based on code inspection
21 and bug fixes of various programs which use run-length compression.
22
23 ===============================================================================
24
25 Packbits is a simple yet efficient simple run-length compression method.
26 It has a provision for uncompressed sequences which allows efficient
27 encoding of noisy input.
28
29 A survey of netpbm source code in 2015 found Packbits encoding in the
30 following Netpbm programs: pbmtoescp2, pbmtomacp, pnmtopalm, pnmtopclxl,
31 pnmtops, ppmtoilbm and ppmtopjxl.
32
33 Packbits is an option in the TIFF standard; pamtotiff can generate TIFF
34 images that use Packbits compression, but does so via code in the TIFF
35 library (not part of Netpbm).
36
37 Variants of Packbits are employed in pnmtosgi, pamtopdbimg, pbmtoppa
38 pbmtogo and pamtotga.
39
40 All the above programs formerly did the same compression through different
41 code. This redundancy bloated the Netpbm package and made maintenance
42 difficult. This maintenance difficulty surfaced as an issue when tests with
43 valgrind revealed multiple memory-related problems in the above programs.
44 Indeed, just determining which programs do compression by this method
45 was not simple because of differences in terminology. "Packbits" is often
46 called "run length encoding." In ppmtoilbm the name is "byterun1."
47
48 Today, all Netpbm programs that do Packbits compression with the exception
49 of pamtotiff, pbmtoppa, pbmtogo and pamtotga use the facilities in this
50 file for it.
51 =============================================================================*/
52
53 #include <string.h>
54
55 #include "pm.h"
56 #include "pm_c_util.h"
57 #include "runlength.h"
58 #include "mallocvar.h"
59
60
61
62 static const char * const errorUndefinedMode =
63 "Internal error: compression mode %u not supported";
64
65
66
67 /*-----------------------------------------------------------------------------
68 Run length encoding
69
70 In this simple run-length encoding scheme, compressed and uncompressed
71 strings follow a single index or "flag" byte N. There are several
72 variations, differing in the format of the flag byte, maximum string
73 length and element size (byte or word).
74
75 In the most widely used version, Packbits, the meaning of the flag byte
76 N is defined as follows:
77
78 0-127 means the next N+1 bytes are uncompressed.
79 129-255 means the next byte is to be repeated 257-N times.
80 128 is not used.
81
82 The name "Packbits" is misleading: it packs bytes, not bits.
83 -----------------------------------------------------------------------------*/
84
85
86
87 void
pm_rlenc_compressbyte(const unsigned char * const inbuf,unsigned char * const outbuf,enum pm_RleMode const mode,size_t const inSize,size_t * const outputSizeP)88 pm_rlenc_compressbyte(const unsigned char * const inbuf,
89 unsigned char * const outbuf,
90 enum pm_RleMode const mode,
91 size_t const inSize,
92 size_t * const outputSizeP) {
93 /*-----------------------------------------------------------------------------
94 Compress the contents of input buffer 'inbuf' with Packbits encoding into
95 output buffer 'outbuf'. 'inSize' is the number of bytes of data in 'inbuf'.
96 Return as *outputSizeP the number of bytes we put in 'outbuf'.
97
98 'outbuf' should be allocated with pm_rlenc_allocoutbuf().
99
100 Always encode 3-byte repeat sequences.
101
102 Encode 2-byte repeat sequences only when they are at the start of the block.
103 This ensures that the output is never unnecessarily bloated.
104
105 Original algorithm by Robert A. Knop (rknop@mop.caltech.edu)
106 -----------------------------------------------------------------------------*/
107 unsigned int const maxRun = 128;
108
109 size_t inCurs, outCurs;
110
111 int packBase;
112 int packSign;
113
114 switch (mode) {
115 case PM_RLE_PACKBITS:
116 packBase = 257 ; packSign = -1; break;
117 case PM_RLE_PALMPDB:
118 packBase = 127 ; packSign = +1; break;
119 default:
120 pm_error(errorUndefinedMode, mode);
121 }
122
123 for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
124 if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
125 /*Begin replicate run*/
126 size_t const hold = inCurs;
127 size_t count;
128 for (count = 0;
129 inCurs < inSize &&
130 inbuf[inCurs] == inbuf[hold] &&
131 count < maxRun;
132 ++inCurs, ++count)
133 ;
134 outbuf[outCurs++] = (unsigned char) (packBase + packSign * count);
135 outbuf[outCurs++] = inbuf[hold];
136 } else {
137 /*Do a non-run*/
138 size_t const hold = outCurs;
139 size_t count;
140 ++outCurs;
141 count = 0;
142 while(((inCurs + 2 >= inSize) && (inCurs < inSize) ) ||
143 ((inCurs + 2 < inSize) &&
144 ((inbuf[inCurs] != inbuf[inCurs+1] )
145 || (inbuf[inCurs] != inbuf[inCurs+2] ) ) ) ) {
146 outbuf[outCurs++] = inbuf[inCurs++];
147 if (++count >= 128)
148 break;
149 }
150 outbuf[hold] = (unsigned char)(count - 1);
151 }
152 }
153 *outputSizeP = outCurs;
154 }
155
156
157
158 static void
setFlagElement(void * const outP,enum pm_RleMode const mode,bool const isRepeatRun,size_t const count)159 setFlagElement(void * const outP,
160 enum pm_RleMode const mode,
161 bool const isRepeatRun,
162 size_t const count) {
163 /*---------------------------------------------------------------------------
164 Write the flag byte or word at specified point in the output buffer.
165 -----------------------------------------------------------------------------*/
166 switch (mode) {
167 case PM_RLE_SGI16:
168 * (uint16_t *) outP = (isRepeatRun ? 0x00 : 0x80 ) | count;
169 break;
170 case PM_RLE_PALM16:
171 * (unsigned char *) outP = isRepeatRun ?
172 (unsigned char)(257 - count) : (unsigned char) (count - 1);
173 break;
174 default:
175 pm_error(errorUndefinedMode, mode);
176 }
177 }
178
179
180
181 void
pm_rlenc_compressword(const uint16_t * const inbuf,unsigned char * const outbuf,enum pm_RleMode const mode,size_t const inSize,size_t * const outputSizeP)182 pm_rlenc_compressword(const uint16_t * const inbuf,
183 unsigned char * const outbuf,
184 enum pm_RleMode const mode,
185 size_t const inSize,
186 size_t * const outputSizeP) {
187 /*---------------------------------------------------------------------------
188 Similar to pm_rlenc_compressbyte(), but this works with two-byte elements.
189 The difference between SGI16 and PALM16 is the size of the flag.
190 SGI16 : 16 bits; PALM16 : 8 bits
191
192 'inSize' is a number of words, but *outputSizeP is a number of bytes.
193 -----------------------------------------------------------------------------*/
194 size_t inCurs, outCurs;
195 size_t flagSz;
196 unsigned int maxRunSz;
197
198 switch (mode) {
199 case PM_RLE_SGI16:
200 flagSz = 2;
201 maxRunSz = 127;
202 break;
203 case PM_RLE_PALM16:
204 flagSz = 1;
205 maxRunSz = 128;
206 break;
207 default:
208 pm_error(errorUndefinedMode, mode);
209 }
210
211 for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
212 if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
213 uint16_t const runValue = inbuf[inCurs];
214 size_t count;
215 /* Do a run of 'runValue' values */
216 for (count = 0;
217 count < maxRunSz &&
218 inCurs < inSize &&
219 inbuf[inCurs] == runValue;
220 ++inCurs, ++count)
221 ;
222 setFlagElement(&outbuf[outCurs], mode, TRUE, count);
223 outCurs += flagSz;
224 * (uint16_t *) &outbuf[outCurs] = runValue;
225 outCurs += 2;
226 } else {
227 /*Do a non run*/
228 size_t const nonrunStart = inCurs;
229 size_t count;
230 count = 0;
231 while (count < maxRunSz &&
232 ((inCurs + 2 >= inSize && inCurs < inSize) ||
233 (inCurs + 2 < inSize &&
234 (inbuf[inCurs] != inbuf[inCurs+1]
235 || inbuf[inCurs] != inbuf[inCurs+2])))) {
236 ++inCurs;
237 ++count;
238 }
239 setFlagElement(&outbuf[outCurs], mode, FALSE, count);
240 outCurs += flagSz;
241 memcpy(&outbuf[outCurs], &inbuf[nonrunStart], count * 2);
242 outCurs += count * 2;
243 }
244 }
245
246 if (mode == PM_RLE_SGI16) {
247 * (uint16_t *) &outbuf[outCurs] = 0; /* terminator */
248 outCurs += 2;
249 }
250
251 *outputSizeP = outCurs;
252 }
253
254
255
256 /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
257 Worst case output size
258
259 The term "worst-case output size" can mean one of two things depending on
260 whether the encoder is efficient or not.
261
262 Sub-optimal encoder: the output size can be up to twice the input size.
263 This happens (or one may rather say "is achieved by a determined encoder")
264 when input is split into one-byte blocks.
265
266 Efficient encoder: the output is larger than the input by the minimum
267 number of flag bytes. The worst case happens when there are no repeat
268 sequences in the input.
269
270 The encoder in this file is an efficient one.
271
272 The key to an efficient encoder is correct handling of short, inefficient
273 sequences. The following algorithm (not actually used in this file)
274 describes the idea.
275
276 A run is a maximal set of two or more consecutive identical values in the
277 input. A nonrun is a maximal set of values in which every value is
278 different from its neighbors.
279
280 A compressed block is one that encodes a sequence of identical values
281 (which could be a run or just part of a very long run) as a value and
282 a count. An uncompressed block is one that encodes a sequence of any
283 values by listing the values individually.
284
285 Start by encoding the entire input as uncompressed blocks. Seek runs that
286 can be encoded with compressed blocks, but only if doing so doesn't make
287 the output larger.
288
289 Criteria to avoid bloat:
290
291 - Overhead for a single uncompressed block: 1 byte.
292
293 - Overhead for one uncompressed block and a compressed block: 2 bytes.
294
295 - Overhead for two uncompressed blocks and a compressed block: 3 bytes.
296
297 - New blocks at the edge of any existing block add 1 byte of overhead.
298 New blocks in the middle of existing blocks add 2 bytes of overhead.
299
300 - Whenever savings are larger than the added overhead, encode the run
301 as a compressed block.
302
303 -----------------------------------------------------------------------
304
305 MS-DOS/Windows BMP format note
306
307 The compression method for BMP is a variant of packbits.
308 We could support the 8-bit version with some modifications to functions
309 in this file. Determining the worst-case output size of an efficient
310 coder is rather complicated because uncompressed blocks may not be less
311 than three bytes long and are indicated by two-byte flags.
312 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
313
314
315
316 size_t
pm_rlenc_maxbytes(size_t const inSize,enum pm_RleMode const mode)317 pm_rlenc_maxbytes(size_t const inSize, /* number of elements */
318 enum pm_RleMode const mode) {
319 /*---------------------------------------------------------------------------
320 Calculate worst case output size from input size and encoding mode.
321
322 'inSize' counts the number of elements, not bytes: input size is (2 *
323 inSize) bytes if input is an array of 16-bit words.
324
325 Return value is the maximum possible output size in bytes regardless of
326 type of input.
327
328 Abort the program if the maximum possible output size is greater than
329 INT_MAX.
330 -----------------------------------------------------------------------------*/
331 /* The upper limit could be raised above INT_MAX, but no program needs that
332 today.
333 */
334 size_t blockSize; /* Number of items in data block */
335 size_t flagSize; /* Size of flag, in bytes */
336 size_t itemSize; /* Size of item, in bytes */
337 size_t miscSize; /* Size of other elements such as term code, in bytes */
338 size_t overhead; /* Worst-case overhead, in bytes */
339 /* return value: Worst-case output size, in bytes */
340
341 switch (mode) {
342 case PM_RLE_PACKBITS:
343 case PM_RLE_PALMPDB:
344 blockSize = 128; flagSize = 1; itemSize = 1; miscSize = 0;
345 break;
346 case PM_RLE_SGI8:
347 blockSize = 127; flagSize = 1; itemSize = 1; miscSize = 0;
348 break;
349 case PM_RLE_GRAPHON:
350 case PM_RLE_PPA:
351 blockSize = 64; flagSize = 1; itemSize = 1; miscSize = 0;
352 break;
353 case PM_RLE_SGI16:
354 blockSize = 127; flagSize = 2; itemSize = 2; miscSize = 2;
355 break;
356 case PM_RLE_PALM16:
357 blockSize = 128; flagSize = 1; itemSize = 2; miscSize = 0;
358 break;
359 default:
360 pm_error(errorUndefinedMode, mode);
361 }
362
363 overhead = miscSize +
364 (inSize / blockSize + (inSize % blockSize > 0 ? 1 : 0) ) * flagSize;
365
366 if (inSize > INT_MAX / itemSize ||
367 inSize * itemSize > INT_MAX - overhead)
368 pm_error("Cannot do RLE compression. Input too large.");
369
370 return inSize * itemSize + overhead;
371 }
372
373
374
375 void
pm_rlenc_allocoutbuf(unsigned char ** const outbufP,size_t const inSize,enum pm_RleMode const mode)376 pm_rlenc_allocoutbuf(unsigned char ** const outbufP,
377 size_t const inSize, /* element count */
378 enum pm_RleMode const mode) {
379 /*---------------------------------------------------------------------------
380 Allocate an output buffer sufficient for input with inSize elements, using
381 compression mode 'mode'. Element may be byte or word, whichever 'mode'
382 implies.
383 -----------------------------------------------------------------------------*/
384 size_t const size = pm_rlenc_maxbytes(inSize, mode);
385
386 unsigned char * outbuf;
387
388 MALLOCARRAY(outbuf, size);
389 if (outbuf == NULL)
390 pm_error("Out of memory trying to get %u bytes for RLE output buffer",
391 (unsigned)size);
392
393 *outbufP = outbuf;
394 }
395
396
397
398 void
pm_rlenc_freebuf(void * const buf)399 pm_rlenc_freebuf(void * const buf) {
400 free(buf);
401 }
402
403
404