1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 2003-2011 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Phong Vo <kpv@research.att.com> *
18 * *
19 ***********************************************************************/
20 #include <vclib.h>
21
22 /* Various run-length-encoding methods.
23 **
24 ** Written by Kiem-Phong Vo (kpv@research.att.com)
25 */
26
27 typedef struct _rle_s Rle_t;
28 typedef ssize_t (*Rle_f)_ARG_((Rle_t*, int));
29 static ssize_t rle0 _ARG_((Rle_t*, int));
30 static ssize_t rle1 _ARG_((Rle_t*, int));
31 static ssize_t rle2 _ARG_((Rle_t*, int));
32 static ssize_t rleg _ARG_((Rle_t*, int));
33
34 struct _rle_s
35 {
36 Rle_f rlef; /* rle function to call */
37 Vcchar_t* ibuf; /* default input data */
38 ssize_t isiz;
39
40 Vcchar_t* obuf; /* default output data */
41 ssize_t osiz;
42 Vcchar_t* endo; /* upper bound of array */
43
44 Vcchar_t* abuf; /* auxiliary buffer */
45 ssize_t asiz;
46
47 Vcchar_t run1; /* run-heads of rle2() */
48 Vcchar_t run2;
49 };
50
51 /* arguments to select type of run length coder */
52 static Vcmtarg_t _Rleargs[] =
53 { { "0", "Run-length-encoding: 0-sequences only.", (Void_t*)rle0 },
54 { "1", "Run-length-encoding: 0&1-sequences only.", (Void_t*)rle1 },
55 { "2", "Run-length-encoding: Alphabet has only two letters.", (Void_t*)rle2 },
56 { 0 , "General run-length-encoding.", (Void_t*)rleg }
57 };
58
59 /* Encoding 0-run's only. Useful for sequences undergone entropy reduction
60 transforms such as Burrows-Wheele + MTF or the column prediction method.
61 */
62 #if __STD_C
rle0(Rle_t * rle,int encoding)63 static ssize_t rle0(Rle_t* rle, int encoding)
64 #else
65 static ssize_t rle0(rle, encoding)
66 Rle_t* rle;
67 int encoding;
68 #endif
69 {
70 ssize_t c, z;
71 Vcchar_t *dt, *enddt, *o, *endo;
72 Vcio_t io;
73
74 enddt = (dt = rle->ibuf) + rle->isiz;
75 o = rle->obuf; rle->osiz = 0;
76
77 if(encoding)
78 { for(z = -1; dt < enddt; ++dt)
79 { if((c = *dt) == 0)
80 z += 1;
81 else
82 { if(z >= 0) /* encode the 0-run */
83 { /**/DEBUG_PRINT(9,"%d\n",z);
84 vcioinit(&io, o, 8*sizeof(ssize_t));
85 vcioput2(&io, z, 0, RL_ZERO);
86 o = vcionext(&io);
87 z = -1;
88 }
89 if(c >= RL_ZERO) /* literal encoding */
90 *o++ = RL_ESC;
91 *o++ = c;
92 }
93 }
94 if(z >= 0) /* final 0-run if any */
95 { /**/DEBUG_PRINT(9,"%d\n",z);
96 vcioinit(&io, o, 8*sizeof(ssize_t));
97 vcioput2(&io, z, 0, RL_ZERO);
98 o = vcionext(&io);
99 }
100 }
101 else
102 { for(endo = rle->endo; dt < enddt; )
103 { if((c = *dt++) == RL_ESC)
104 { if(dt >= enddt) /* premature EOF */
105 return -1;
106 c = *dt++;
107 if(o >= endo || c < RL_ZERO) /* corrupted data */
108 return -1;
109 *o++ = c;
110 }
111 else if(c == 0 || c == RL_ZERO)
112 { vcioinit(&io, dt-1, (enddt-dt)+1);
113 z = vcioget2(&io, 0, RL_ZERO); /**/DEBUG_PRINT(9,"%d\n",z);
114 dt = vcionext(&io);
115 if(o+z > endo)
116 return -1;
117 for(; z >= 0; --z)
118 *o++ = 0;
119 }
120 else
121 { if(o >= endo)
122 return -1;
123 *o++ = c;
124 }
125 }
126 }
127
128 return (rle->osiz = o - rle->obuf);
129 }
130
131 /* Like rle0() but including 1-run's as well as 0-runs */
132 #if __STD_C
rle1(Rle_t * rle,int encoding)133 static ssize_t rle1(Rle_t* rle, int encoding)
134 #else
135 static ssize_t rle1(rle, encoding)
136 Rle_t* rle;
137 int encoding;
138 #endif
139 {
140 ssize_t c, rc, rz;
141 Vcchar_t *dt, *enddt, *o, *endo;
142 Vcio_t io;
143
144 enddt = (dt = rle->ibuf) + rle->isiz;
145 o = rle->obuf; rle->osiz = 0;
146
147 if(encoding)
148 { for(rc = rz = -1; dt < enddt; ++dt)
149 { if((c = *dt) == 0 || c == 1)
150 { if(c == rc)
151 rz += 1;
152 else
153 { if(rz >= 0)
154 { vcioinit(&io, o, 8*sizeof(ssize_t));
155 if(rc == 0)
156 vcioput2(&io, rz, 0, RL_ZERO);
157 else vcioput2(&io, rz, 1, RL_ONE);
158 o = vcionext(&io);
159 }
160
161 rc = c; rz = 0;
162 }
163 }
164 else
165 { if(rz >= 0) /* encode the 0/1-run */
166 { vcioinit(&io, o, 8*sizeof(ssize_t));
167 if(rc == 0)
168 vcioput2(&io, rz, 0, RL_ZERO);
169 else vcioput2(&io, rz, 1, RL_ONE);
170 o = vcionext(&io);
171
172 rc = rz = -1;
173 }
174
175 if(c >= RL_ONE) /* literal encoding */
176 *o++ = RL_ESC;
177 *o++ = c;
178 }
179 }
180 if(rz >= 0) /* final 0/1-run if any */
181 { vcioinit(&io, o, 8*sizeof(ssize_t));
182 if(rc == 0)
183 vcioput2(&io, rz, 0, RL_ZERO);
184 else vcioput2(&io, rz, 1, RL_ONE);
185 o = vcionext(&io);
186 }
187 }
188 else
189 { for(endo = rle->endo; dt < enddt; )
190 { if((c = *dt++) == RL_ESC)
191 { if(dt >= enddt) /* premature EOF */
192 return -1;
193 c = *dt++;
194 if(o >= endo || c < RL_ONE) /* corrupted data */
195 return -1;
196 *o++ = c;
197 }
198 else if(c == 0 || c == RL_ZERO || c == 1 || c == RL_ONE)
199 { vcioinit(&io, dt-1, (enddt-dt)+1);
200 if(c == 0 || c == RL_ZERO)
201 rz = vcioget2(&io, 0, RL_ZERO);
202 else rz = vcioget2(&io, 1, RL_ONE);
203 dt = vcionext(&io);
204 if(o+rz > endo)
205 return -1;
206 if(c == 0 || c == RL_ZERO)
207 { for(; rz >= 0; --rz)
208 *o++ = 0;
209 }
210 else
211 { for(; rz >= 0; --rz)
212 *o++ = 1;
213 }
214 }
215 else
216 { if(o >= endo)
217 return -1;
218 *o++ = c;
219 }
220 }
221 }
222
223 return (rle->osiz = o - rle->obuf);
224 }
225
226 /* Encoding a sequence with at most two distinct letters */
227 #if __STD_C
rle2(Rle_t * rle,int encoding)228 static ssize_t rle2(Rle_t* rle, int encoding)
229 #else
230 static ssize_t rle2(rle, encoding)
231 Rle_t* rle;
232 int encoding;
233 #endif
234 {
235 Vcchar_t *dt, *enddt;
236 ssize_t sz, n, k, c, c1, c2;
237 Vcio_t io;
238
239 if(encoding)
240 { dt = rle->ibuf; sz = rle->isiz; /**/DEBUG_ASSERT(sz >= 1);
241 vcioinit(&io, rle->obuf, rle->osiz);
242
243 for(c1 = dt[0], n = 1; n < sz; ++n)
244 if(dt[n] != c1)
245 break;
246 if(n == sz)
247 c2 = c1; /* just a single run */
248 else
249 { c2 = dt[n];
250 for(c = c1, k = n; k < sz; ++k)
251 { if(dt[k] == c) /* current run continues */
252 n += 1;
253 else /* end of current run */
254 { if((c = dt[k]) != c1 && c != c2)
255 RETURN(-1);
256 vcioputu(&io, n-1);
257 n = 1;
258 }
259 }
260 }
261 vcioputu(&io, n-1); /* output length of last run */
262
263 rle->run1 = c1; rle->run2 = c2;
264 return rle->osiz = vciosize(&io);
265 }
266 else
267 { dt = rle->obuf; enddt = rle->endo;
268 c1 = rle->run1; c2 = rle->run2;
269 vcioinit(&io, rle->ibuf, rle->isiz);
270 for(c = c1; vciomore(&io) > 0;)
271 { for(n = vciogetu(&io); n >= 0 && dt < enddt; --n, ++dt)
272 *dt = c;
273 c = c == c1 ? c2 : c1; /* alternate run letters */
274 }
275 if(vciomore(&io) != 0)
276 RETURN(-1);
277
278 return (rle->osiz = dt - rle->obuf);
279 }
280 }
281
282 /* General rl-encoding using escape codes */
283 #if __STD_C
rleg(Rle_t * rle,int encoding)284 static ssize_t rleg(Rle_t* rle, int encoding)
285 #else
286 static ssize_t rleg(rle, encoding)
287 Rle_t* rle;
288 int encoding;
289 #endif
290 {
291 Vcchar_t c, *chr, *run, *dt, *enddt, *endb, *nextb;
292 ssize_t r;
293 Vcio_t io;
294
295 if(encoding)
296 { endb = (dt = rle->ibuf) + rle->isiz;
297
298 /* set buffers for runs and data */
299 chr = rle->obuf;
300 run = rle->abuf;
301 for(; dt < endb; dt = nextb)
302 { for(c = *dt, nextb = dt+1; nextb < endb; ++nextb)
303 if(*nextb != c)
304 break;
305 if((r = nextb-dt) >= 3 || c == RL_ESC)
306 { /* in-line small cases here for speed */
307 if(r < (1<<7))
308 *run++ = r;
309 else if(r < (1<<14))
310 { *run++ = (r>>7)|128;
311 *run++ = r&127;
312 }
313 else
314 { vcioinit(&io, run, 2*sizeof(ssize_t));
315 vcioputu(&io, r);
316 run = vcionext(&io);
317 }
318
319 *chr++ = RL_ESC;
320 if(c != RL_ESC || r > 1)
321 *chr++ = c;
322 }
323 else
324 { *chr++ = c;
325 if(r == 2)
326 *chr++ = c;
327 }
328 }
329
330 return (rle->osiz = chr - rle->obuf) + (rle->asiz = run - rle->abuf);
331 }
332 else
333 { dt = rle->obuf; enddt = rle->endo;
334 vcioinit(&io, rle->abuf, rle->asiz);
335 for(endb = (chr = rle->ibuf) + rle->isiz; chr < endb; )
336 { if((c = *chr++) != RL_ESC)
337 { if(dt >= enddt)
338 return -1;
339 *dt++ = c;
340 }
341 else
342 { r = vciogetu(&io);
343 if(dt+r > enddt)
344 return -1;
345 if(r == 1)
346 *dt++ = RL_ESC;
347 else for(c = *chr++; r > 0; --r)
348 *dt++ = c;
349 }
350 }
351
352 return (rle->osiz = dt - rle->obuf);
353 }
354 }
355
356
357 #if __STD_C
vcrle(Vcodex_t * vc,const Void_t * data,size_t size,Void_t ** out)358 static ssize_t vcrle(Vcodex_t* vc, const Void_t* data, size_t size, Void_t** out)
359 #else
360 static ssize_t vcrle(vc, data, size, out)
361 Vcodex_t* vc;
362 Void_t* data;
363 size_t size;
364 Void_t** out;
365 #endif
366 {
367 Vcchar_t *dt, *output, *space;
368 ssize_t k, hd, sz, outsz;
369 Vcio_t io;
370 Rle_t *rle = vcgetmtdata(vc, Rle_t*);
371
372 if(size == 0)
373 return 0;
374
375 /* input data to encode */
376 rle->ibuf = (Vcchar_t*)data;
377 rle->isiz = (ssize_t)size;
378
379 /* size of header in transformed data */
380 hd = vcsizeu(size);
381 if(rle->rlef == rle2)
382 hd += 2; /* for the two run bytes */
383
384 /* allocate buffers for transformed data */
385 if(rle->rlef == rleg) /* size for data + size/2 for lengths */
386 outsz = 2*vcsizeu(size) + size + size/2;
387 else if(rle->rlef == rle2)
388 outsz = size; /* binary coding never exceeds size */
389 else if(size < 64*1024*1024) /* if get here: rle0/1 coding */
390 outsz = 2*size; /* small, just overallocate */
391 else /* count the extra RL_ESC's that might be needed */
392 { for(dt = rle->ibuf, sz = 0, k = 0; k < size; ++k)
393 if(dt[k] >= RL_ONE)
394 sz += 1;
395 outsz = size + sz;
396 }
397 if(!(output = space = vcbuffer(vc, NIL(Vcchar_t*), outsz+128, hd)) )
398 RETURN(-1);
399
400 if(rle->rlef == rleg)
401 { rle->obuf = output + vcsizeu(size);
402 rle->endo = rle->obuf + (rle->osiz = size);
403 rle->abuf = rle->endo + vcsizeu(size);
404 rle->asiz = size/2;
405
406 if((sz = (*rle->rlef)(rle, 1)) < 0 )
407 RETURN(-1);
408
409 if(vc->coder) /* run continuator on the two parts */
410 { if(vcrecode(vc, &rle->obuf, &rle->osiz, 0, 0) < 0)
411 RETURN(-1);
412 if(vcrecode(vc, &rle->abuf, &rle->asiz, 0, 0) < 0)
413 RETURN(-1);
414 }
415
416 sz = vcsizeu(rle->osiz) + rle->osiz + vcsizeu(rle->asiz) + rle->asiz;
417 if(sz > outsz)
418 output = vcbuffer(vc, NIL(Vcchar_t*), sz, hd);
419
420 vcioinit(&io, output, sz);
421
422 vcioputu(&io, rle->osiz);
423 if(rle->obuf != vcionext(&io))
424 vcioputs(&io, rle->obuf, rle->osiz);
425 else vcioskip(&io, rle->osiz);
426
427 vcioputu(&io, rle->asiz);
428 if(rle->abuf != vcionext(&io))
429 vcioputs(&io, rle->abuf, rle->asiz);
430 else vcioskip(&io, rle->asiz);
431
432 sz = vciosize(&io);
433 }
434 else
435 { rle->obuf = output;
436 rle->endo = rle->obuf + outsz;
437
438 if((sz = (*rle->rlef)(rle, 1)) < 0 )
439 RETURN(-1);
440
441 if(vcrecode(vc, &output, &sz, hd, 0) < 0 )
442 RETURN(-1);
443 }
444
445 if(space != output) /* free space if unused */
446 vcbuffer(vc, space, -1, -1);
447
448 output -= hd; sz += hd;
449 vcioinit(&io, output, hd);
450 vcioputu(&io, size);
451
452 if(rle->rlef == rle2)
453 { vcioputc(&io, rle->run1);
454 vcioputc(&io, rle->run2);
455 }
456
457 if(!(output = vcbuffer(vc, output, sz, -1)) ) /* truncate buffer to size */
458 RETURN(-1);
459 if(out)
460 *out = output;
461 return sz;
462 }
463
464 #if __STD_C
vcunrle(Vcodex_t * vc,const Void_t * data,size_t size,Void_t ** out)465 static ssize_t vcunrle(Vcodex_t* vc, const Void_t* data, size_t size, Void_t** out)
466 #else
467 static ssize_t vcunrle(vc, data, size, out)
468 Vcodex_t* vc;
469 Void_t* data;
470 size_t size;
471 Void_t** out;
472 #endif
473 {
474 Vcchar_t *output;
475 ssize_t sz;
476 Vcio_t io;
477 Rle_t *rle = vcgetmtdata(vc, Rle_t*);
478
479 if(size == 0)
480 return 0;
481
482 rle->ibuf = rle->abuf = NIL(Vcchar_t*);
483 rle->isiz = rle->asiz = 0;
484
485 vcioinit(&io, data, size);
486 if((sz = vciogetu(&io)) <= 0 )
487 RETURN(-1);
488
489 if(rle->rlef == rle2) /* get the two run bytes */
490 { if(vciomore(&io) < 2)
491 RETURN(-1);
492 rle->run1 = vciogetc(&io);
493 rle->run2 = vciogetc(&io);
494 }
495
496 if(!(output = vcbuffer(vc, (Vcchar_t*)0, sz, 0)) )
497 RETURN(-1);
498 rle->obuf = output;
499 rle->endo = rle->obuf + sz;
500
501 if(rle->rlef == rleg)
502 { if(vciomore(&io) <= 0 || (rle->isiz = vciogetu(&io)) < 0)
503 RETURN(-1);
504 if(vciomore(&io) < rle->isiz)
505 RETURN(-1);
506 rle->ibuf = vcionext(&io);
507 vcioskip(&io, rle->isiz);
508
509 if(vciomore(&io) <= 0 || (rle->asiz = vciogetu(&io)) < 0)
510 RETURN(-1);
511 if(vciomore(&io) < rle->asiz)
512 RETURN(-1);
513 rle->abuf = vcionext(&io);
514
515 /* decode data by secondary encoders */
516 if(vcrecode(vc, &rle->ibuf, &rle->isiz, 0, 0) < 0)
517 RETURN(-1);
518 if(vcrecode(vc, &rle->abuf, &rle->asiz, 0, 0) < 0)
519 RETURN(-1);
520 }
521 else
522 { if(vciomore(&io) <= 0 || (rle->isiz = vciomore(&io)) < 0 )
523 RETURN(-1);
524 rle->ibuf = vcionext(&io);
525 if(vcrecode(vc, &rle->ibuf, &rle->isiz, 0, 0) < 0)
526 RETURN(-1);
527 }
528
529 if((*rle->rlef)(rle, 0) != sz) /* decode run data */
530 RETURN(-1);
531
532 /* free temporary data */
533 if(rle->ibuf && (rle->ibuf < (Vcchar_t*)data || rle->ibuf >= (Vcchar_t*)data+size) )
534 vcbuffer(vc, rle->ibuf, -1, -1);
535 if(rle->abuf && (rle->abuf < (Vcchar_t*)data || rle->abuf >= (Vcchar_t*)data+size) )
536 vcbuffer(vc, rle->abuf, -1, -1);
537
538 if(out)
539 *out = output;
540 return sz;
541 }
542
543 #if __STD_C
rleextract(Vcodex_t * vc,Vcchar_t ** datap)544 static ssize_t rleextract(Vcodex_t* vc, Vcchar_t** datap)
545 #else
546 static ssize_t rleextract(vc, datap)
547 Vcodex_t* vc;
548 Vcchar_t** datap; /* basis string for persistence */
549 #endif
550 {
551 Vcmtarg_t *arg;
552 char *ident;
553 ssize_t n;
554 Rle_t *rle = vcgetmtdata(vc, Rle_t*);
555
556 for(arg = _Rleargs;; ++arg)
557 if(!arg->name || arg->data == (Void_t*)rle->rlef)
558 break;
559 if(!arg->name)
560 return 0;
561
562 n = strlen(arg->name);
563 if(!(ident = (char*)vcbuffer(vc, NIL(Vcchar_t*), sizeof(int)*n+1, 0)) )
564 RETURN(-1);
565 if(!(ident = vcstrcode(arg->name, ident, sizeof(int)*n+1)) )
566 RETURN(-1);
567 if(datap)
568 *datap = (Void_t*)ident;
569 return n;
570 }
571
572 #if __STD_C
rlerestore(Vcchar_t * data,ssize_t dtsz)573 static Vcodex_t* rlerestore(Vcchar_t* data, ssize_t dtsz)
574 #else
575 static Vcodex_t* rlerestore(data, dtsz)
576 Vcchar_t* data; /* persistence data */
577 ssize_t dtsz;
578 #endif
579 {
580 Vcmtarg_t *arg;
581 char *ident, buf[1024];
582
583 for(arg = _Rleargs; arg->name; ++arg)
584 { if(!(ident = vcstrcode(arg->name, buf, sizeof(buf))) )
585 return NIL(Vcodex_t*);
586 if(dtsz == strlen(ident) && strncmp(ident, (Void_t*)data, dtsz) == 0)
587 break;
588 }
589 return vcopen(0, Vcrle, (Void_t*)arg->name, 0, VC_DECODE);
590 }
591
592 #if __STD_C
rleevent(Vcodex_t * vc,int type,Void_t * params)593 static int rleevent(Vcodex_t* vc, int type, Void_t* params)
594 #else
595 static int rleevent(vc, type, params)
596 Vcodex_t* vc;
597 int type;
598 Void_t* params;
599 #endif
600 {
601 Rle_t *rle;
602 Vcmtarg_t *arg;
603 Vcmtcode_t *mtcd;
604 char *data;
605
606 if(type == VC_OPENING )
607 { for(arg = NIL(Vcmtarg_t*), data = (char*)params; data && *data; )
608 { data = vcgetmtarg(data, 0, 0, _Rleargs, &arg);
609 if(arg && arg->name)
610 break;
611 }
612 if(!arg)
613 for(arg = _Rleargs;; ++arg)
614 if(!arg->name)
615 break;
616
617 if(!(rle = (Rle_t*)calloc(1,sizeof(Rle_t))) )
618 RETURN(-1);
619 rle->rlef = (Rle_f)arg->data;
620
621 vcsetmtdata(vc, rle);
622 return 0;
623 }
624 else if(type == VC_CLOSING)
625 { if((rle = vcgetmtdata(vc, Rle_t*)) )
626 free(rle);
627 return 0;
628 }
629 else if(type == VC_EXTRACT)
630 { if(!(mtcd = (Vcmtcode_t*)params) )
631 RETURN(-1);
632 if((mtcd->size = rleextract(vc, &mtcd->data)) < 0 )
633 RETURN(-1);
634 return 1;
635 }
636 else if(type == VC_RESTORE)
637 { if(!(mtcd = (Vcmtcode_t*)params) )
638 RETURN(-1);
639 if(!(mtcd->coder = rlerestore(mtcd->data, mtcd->size)) )
640 RETURN(-1);
641 return 1;
642 }
643
644 return 0;
645 }
646
647 Vcmethod_t _Vcrle =
648 { vcrle,
649 vcunrle,
650 rleevent,
651 "rle", "Run-length encoding.",
652 "[-version?rle (AT&T Research) 2003-01-01]" USAGE_LICENSE,
653 _Rleargs,
654 1024*1024,
655 0
656 };
657
658 VCLIB(Vcrle)
659