1 /*=============================================================================
2                                   runlength.c
3 ===============================================================================
4   "Packbits" run-length encoding and variants.
5 
6   Copyright (C) 2015 by Akira Urushibata (afu@wta.att.ne.jp).
7 
8   Permission to use, copy, modify, and distribute this software and its
9   documentation for any purpose and without fee is hereby granted, provided
10   that the above copyright notice appear in all copies and that both that
11   copyright notice and this permission notice appear in supporting
12   documentation.  This software is provided "as is" without express or implied
13   warranty.
14 
15   Functions pm_rlenc_compressbyte() and pm_rlenc_compressword() are based
16   on algorithm originally by Robert A. Knop (rknop@mop.caltech.edu).
17 
18   Those who wish to incorporate the code herein into their own programs are
19   strongly discouraged from removing the comments within borders consisting
20   of "+" marks.  This is a practical consideration based on code inspection
21   and bug fixes of various programs which use run-length compression.
22 
23 ===============================================================================
24 
25   Packbits is a simple yet efficient simple run-length compression method.
26   It has a provision for uncompressed sequences which allows efficient
27   encoding of noisy input.
28 
29   A survey of netpbm source code in 2015 found Packbits encoding in the
30   following Netpbm programs: pbmtoescp2, pbmtomacp, pnmtopalm, pnmtopclxl,
31   pnmtops, ppmtoilbm and ppmtopjxl.
32 
33   Packbits is an option in the TIFF standard; pamtotiff can generate TIFF
34   images that use Packbits compression, but does so via code in the TIFF
35   library (not part of Netpbm).
36 
37   Variants of Packbits are employed in pnmtosgi, pamtopdbimg, pbmtoppa
38   pbmtogo and pamtotga.
39 
40   All the above programs formerly did the same compression through different
41   code.  This redundancy bloated the Netpbm package and made maintenance
42   difficult.  This maintenance difficulty surfaced as an issue when tests with
43   valgrind revealed multiple memory-related problems in the above programs.
44   Indeed, just determining which programs do compression by this method
45   was not simple because of differences in terminology.  "Packbits" is often
46   called "run length encoding."  In ppmtoilbm the name is "byterun1."
47 
48   Today, all Netpbm programs that do Packbits compression with the exception
49   of pamtotiff, pbmtoppa, pbmtogo and pamtotga use the facilities in this
50   file for it.
51 =============================================================================*/
52 
53 #include <string.h>
54 
55 #include "pm.h"
56 #include "pm_c_util.h"
57 #include "runlength.h"
58 #include "mallocvar.h"
59 
60 
61 
62 static const char * const errorUndefinedMode =
63     "Internal error: compression mode %u not supported";
64 
65 
66 
67 /*-----------------------------------------------------------------------------
68    Run length encoding
69 
70    In this simple run-length encoding scheme, compressed and uncompressed
71    strings follow a single index or "flag" byte N.  There are several
72    variations, differing in the format of the flag byte, maximum string
73    length and element size (byte or word).
74 
75    In the most widely used version, Packbits, the meaning of the flag byte
76    N is defined as follows:
77 
78     0-127 means the next N+1 bytes are uncompressed.
79     129-255 means the next byte is to be repeated 257-N times.
80     128 is not used.
81 
82    The name "Packbits" is misleading: it packs bytes, not bits.
83 -----------------------------------------------------------------------------*/
84 
85 
86 
87 void
pm_rlenc_compressbyte(const unsigned char * const inbuf,unsigned char * const outbuf,enum pm_RleMode const mode,size_t const inSize,size_t * const outputSizeP)88 pm_rlenc_compressbyte(const unsigned char * const inbuf,
89                       unsigned char       * const outbuf,
90                       enum pm_RleMode       const mode,
91                       size_t                const inSize,
92                       size_t              * const outputSizeP) {
93 /*-----------------------------------------------------------------------------
94   Compress the contents of input buffer 'inbuf' with Packbits encoding into
95   output buffer 'outbuf'.  'inSize' is the number of bytes of data in 'inbuf'.
96   Return as *outputSizeP the number of bytes we put in 'outbuf'.
97 
98   'outbuf' should be allocated with pm_rlenc_allocoutbuf().
99 
100   Always encode 3-byte repeat sequences.
101 
102   Encode 2-byte repeat sequences only when they are at the start of the block.
103   This ensures that the output is never unnecessarily bloated.
104 
105   Original algorithm by Robert A. Knop (rknop@mop.caltech.edu)
106 -----------------------------------------------------------------------------*/
107     unsigned int const maxRun = 128;
108 
109     size_t inCurs, outCurs;
110 
111     int packBase;
112     int packSign;
113 
114     switch (mode) {
115     case PM_RLE_PACKBITS:
116         packBase = 257 ; packSign = -1; break;
117     case PM_RLE_PALMPDB:
118         packBase = 127 ; packSign = +1; break;
119     default:
120          pm_error(errorUndefinedMode, mode);
121     }
122 
123     for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
124         if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
125             /*Begin replicate run*/
126             size_t const hold = inCurs;
127             size_t count;
128             for (count = 0;
129                  inCurs < inSize &&
130                      inbuf[inCurs] == inbuf[hold] &&
131                      count < maxRun;
132                  ++inCurs, ++count)
133                 ;
134             outbuf[outCurs++] = (unsigned char) (packBase + packSign * count);
135             outbuf[outCurs++] = inbuf[hold];
136         } else {
137             /*Do a non-run*/
138             size_t const hold = outCurs;
139             size_t count;
140             ++outCurs;
141             count = 0;
142             while(((inCurs + 2 >= inSize) && (inCurs < inSize) ) ||
143                   ((inCurs + 2 <  inSize) &&
144                    ((inbuf[inCurs] != inbuf[inCurs+1]  )
145                     || (inbuf[inCurs] != inbuf[inCurs+2]  ) ) )    ) {
146                 outbuf[outCurs++] = inbuf[inCurs++];
147                 if (++count >= 128)
148                     break;
149             }
150             outbuf[hold] = (unsigned char)(count - 1);
151         }
152     }
153     *outputSizeP = outCurs;
154 }
155 
156 
157 
158 static void
setFlagElement(void * const outP,enum pm_RleMode const mode,bool const isRepeatRun,size_t const count)159 setFlagElement(void            * const outP,
160                enum pm_RleMode   const mode,
161                bool              const isRepeatRun,
162                size_t            const count) {
163 /*---------------------------------------------------------------------------
164   Write the flag byte or word at specified point in the output buffer.
165 -----------------------------------------------------------------------------*/
166     switch (mode) {
167     case PM_RLE_SGI16:
168         * (uint16_t *) outP =  (isRepeatRun ? 0x00 : 0x80 ) | count;
169         break;
170     case PM_RLE_PALM16:
171         * (unsigned char *) outP = isRepeatRun ?
172             (unsigned char)(257 - count) : (unsigned char) (count - 1);
173         break;
174     default:
175         pm_error(errorUndefinedMode, mode);
176     }
177 }
178 
179 
180 
181 void
pm_rlenc_compressword(const uint16_t * const inbuf,unsigned char * const outbuf,enum pm_RleMode const mode,size_t const inSize,size_t * const outputSizeP)182 pm_rlenc_compressword(const uint16_t   * const inbuf,
183                       unsigned char *    const outbuf,
184                       enum pm_RleMode    const mode,
185                       size_t             const inSize,
186                       size_t           * const outputSizeP) {
187 /*---------------------------------------------------------------------------
188    Similar to pm_rlenc_compressbyte(), but this works with two-byte elements.
189    The difference between SGI16 and PALM16 is the size of the flag.
190    SGI16 : 16 bits; PALM16 : 8 bits
191 
192    'inSize' is a number of words, but *outputSizeP is a number of bytes.
193 -----------------------------------------------------------------------------*/
194     size_t inCurs, outCurs;
195     size_t flagSz;
196     unsigned int maxRunSz;
197 
198     switch (mode) {
199     case PM_RLE_SGI16:
200         flagSz = 2;
201         maxRunSz = 127;
202         break;
203     case PM_RLE_PALM16:
204         flagSz = 1;
205         maxRunSz = 128;
206         break;
207     default:
208         pm_error(errorUndefinedMode, mode);
209     }
210 
211     for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
212         if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
213             uint16_t const runValue = inbuf[inCurs];
214             size_t count;
215             /* Do a run of 'runValue' values */
216             for (count = 0;
217                  count < maxRunSz &&
218                      inCurs < inSize &&
219                      inbuf[inCurs] == runValue;
220                  ++inCurs, ++count)
221                 ;
222             setFlagElement(&outbuf[outCurs], mode, TRUE, count);
223             outCurs += flagSz;
224             * (uint16_t *) &outbuf[outCurs] = runValue;
225             outCurs += 2;
226         } else {
227             /*Do a non run*/
228             size_t const nonrunStart = inCurs;
229             size_t count;
230             count = 0;
231             while (count < maxRunSz &&
232                    ((inCurs + 2 >= inSize && inCurs < inSize) ||
233                     (inCurs + 2 <  inSize &&
234                      (inbuf[inCurs] != inbuf[inCurs+1]
235                       || inbuf[inCurs] != inbuf[inCurs+2])))) {
236                 ++inCurs;
237                 ++count;
238             }
239             setFlagElement(&outbuf[outCurs], mode, FALSE, count);
240             outCurs += flagSz;
241             memcpy(&outbuf[outCurs], &inbuf[nonrunStart], count * 2);
242             outCurs += count * 2;
243         }
244     }
245 
246     if (mode == PM_RLE_SGI16) {
247         * (uint16_t *) &outbuf[outCurs] = 0;     /* terminator */
248         outCurs += 2;
249     }
250 
251     *outputSizeP = outCurs;
252 }
253 
254 
255 
256 /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
257    Worst case output size
258 
259    The term "worst-case output size" can mean one of two things depending on
260    whether the encoder is efficient or not.
261 
262    Sub-optimal encoder: the output size can be up to twice the input size.
263    This happens (or one may rather say "is achieved by a determined encoder")
264    when input is split into one-byte blocks.
265 
266    Efficient encoder: the output is larger than the input by the minimum
267    number of flag bytes.  The worst case happens when there are no repeat
268    sequences in the input.
269 
270    The encoder in this file is an efficient one.
271 
272    The key to an efficient encoder is correct handling of short, inefficient
273    sequences.  The following algorithm (not actually used in this file)
274    describes the idea.
275 
276    A run is a maximal set of two or more consecutive identical values in the
277    input.  A nonrun is a maximal set of values in which every value is
278    different from its neighbors.
279 
280    A compressed block is one that encodes a sequence of identical values
281    (which could be a run or just part of a very long run) as a value and
282    a count.  An uncompressed block is one that encodes a sequence of any
283    values by listing the values individually.
284 
285    Start by encoding the entire input as uncompressed blocks.  Seek runs that
286    can be encoded with compressed blocks, but only if doing so doesn't make
287    the output larger.
288 
289    Criteria to avoid bloat:
290 
291      - Overhead for a single uncompressed block: 1 byte.
292 
293      - Overhead for one uncompressed block and a compressed block: 2 bytes.
294 
295      - Overhead for two uncompressed blocks and a compressed block: 3 bytes.
296 
297      - New blocks at the edge of any existing block add 1 byte of overhead.
298        New blocks in the middle of existing blocks add 2 bytes of overhead.
299 
300      - Whenever savings are larger than the added overhead, encode the run
301        as a compressed block.
302 
303    -----------------------------------------------------------------------
304 
305    MS-DOS/Windows BMP format note
306 
307    The compression method for BMP is a variant of packbits.
308    We could support the 8-bit version with some modifications to functions
309    in this file.  Determining the worst-case output size of an efficient
310    coder is rather complicated because uncompressed blocks may not be less
311    than three bytes long and are indicated by two-byte flags.
312 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
313 
314 
315 
316 size_t
pm_rlenc_maxbytes(size_t const inSize,enum pm_RleMode const mode)317 pm_rlenc_maxbytes(size_t          const inSize,  /* number of elements */
318                   enum pm_RleMode const mode) {
319 /*---------------------------------------------------------------------------
320    Calculate worst case output size from input size and encoding mode.
321 
322    'inSize' counts the number of elements, not bytes: input size is (2 *
323    inSize) bytes if input is an array of 16-bit words.
324 
325    Return value is the maximum possible output size in bytes regardless of
326    type of input.
327 
328    Abort the program if the maximum possible output size is greater than
329    INT_MAX.
330 -----------------------------------------------------------------------------*/
331    /* The upper limit could be raised above INT_MAX, but no program needs that
332       today.
333    */
334     size_t blockSize;  /* Number of items in data block */
335     size_t flagSize;   /* Size of flag, in bytes */
336     size_t itemSize;   /* Size of item, in bytes */
337     size_t miscSize;   /* Size of other elements such as term code, in bytes */
338     size_t overhead;   /* Worst-case overhead, in bytes */
339     /* return value:      Worst-case output size, in bytes */
340 
341     switch (mode) {
342     case PM_RLE_PACKBITS:
343     case PM_RLE_PALMPDB:
344         blockSize = 128; flagSize = 1; itemSize = 1; miscSize = 0;
345         break;
346     case PM_RLE_SGI8:
347         blockSize = 127; flagSize = 1; itemSize = 1; miscSize = 0;
348         break;
349     case PM_RLE_GRAPHON:
350     case PM_RLE_PPA:
351         blockSize =  64; flagSize = 1; itemSize = 1; miscSize = 0;
352         break;
353     case PM_RLE_SGI16:
354         blockSize = 127; flagSize = 2; itemSize = 2; miscSize = 2;
355         break;
356     case PM_RLE_PALM16:
357         blockSize = 128; flagSize = 1; itemSize = 2; miscSize = 0;
358         break;
359     default:
360         pm_error(errorUndefinedMode, mode);
361     }
362 
363     overhead = miscSize +
364         (inSize / blockSize + (inSize % blockSize > 0 ? 1 : 0) ) * flagSize;
365 
366     if (inSize > INT_MAX / itemSize ||
367         inSize * itemSize > INT_MAX - overhead)
368         pm_error("Cannot do RLE compression.  Input too large.");
369 
370     return inSize * itemSize + overhead;
371 }
372 
373 
374 
375 void
pm_rlenc_allocoutbuf(unsigned char ** const outbufP,size_t const inSize,enum pm_RleMode const mode)376 pm_rlenc_allocoutbuf(unsigned char ** const outbufP,
377                      size_t           const inSize,  /* element count */
378                      enum pm_RleMode  const mode) {
379 /*---------------------------------------------------------------------------
380    Allocate an output buffer sufficient for input with inSize elements, using
381    compression mode 'mode'.  Element may be byte or word, whichever 'mode'
382    implies.
383 -----------------------------------------------------------------------------*/
384     size_t const size = pm_rlenc_maxbytes(inSize, mode);
385 
386     unsigned char * outbuf;
387 
388     MALLOCARRAY(outbuf, size);
389     if (outbuf == NULL)
390         pm_error("Out of memory trying to get %u bytes for RLE output buffer",
391                  (unsigned)size);
392 
393     *outbufP = outbuf;
394 }
395 
396 
397 
398 void
pm_rlenc_freebuf(void * const buf)399 pm_rlenc_freebuf(void * const buf) {
400     free(buf);
401 }
402 
403 
404