1 /*
2 * Copyright (c) 2005-2010, 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author(s): James Bonfield
36 *
37 * Copyright (c) 2001-2002 MEDICAL RESEARCH COUNCIL
38 * All rights reserved
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions are met:
42 *
43 * 1 Redistributions of source code must retain the above copyright notice,
44 * this list of conditions and the following disclaimer.
45 *
46 * 2 Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in
48 * the documentation and/or other materials provided with the
49 * distribution.
50 *
51 * 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
52 * MOLECULAR BIOLOGY nor the names of its contributors may be used
53 * to endorse or promote products derived from this software without
54 * specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
57 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
60 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 * POSSIBILITY OF SUCH DAMAGE.
67 */
68
69 #ifdef HAVE_CONFIG_H
70 #include "io_lib_config.h"
71 #endif
72
73 #include <stdio.h>
74 #include <string.h>
75 #include <math.h>
76
77 /* #include <fcntl.h> */
78
79 #include "io_lib/ztr.h"
80 #include "io_lib/xalloc.h"
81 #include "io_lib/Read.h"
82 #include "io_lib/compression.h"
83 #include "io_lib/stdio_hack.h"
84 #include "io_lib/deflate_interlaced.h"
85
86 /* Deprecated #define - see solexa2srf for the more up to date code */
87 /* #define ILLUMINA_GA */
88
89 /*
90 * ---------------------------------------------------------------------------
91 * Trace writing functions.
92 * These consist of several encoding functions, all with the same prototype,
93 * and a single fwrite_ztr function to wrap it all up.
94 * ---------------------------------------------------------------------------
95 */
96
97 /*
98 * ztr_write_header
99 *
100 * Writes a ZTR file header.
101 *
102 * Arguments:
103 * fp A FILE pointer
104 * h A pointer to the header to write
105 *
106 * Returns:
107 * Success: 0
108 * Failure: -1
109 */
ztr_write_header(FILE * fp,ztr_header_t * h)110 static int ztr_write_header(FILE *fp, ztr_header_t *h) {
111 if (1 != fwrite(h, sizeof(*h), 1, fp))
112 return -1;
113
114 return 0;
115 }
116
117 /*
118 * ztr_write_chunk
119 *
120 * Writes a ZTR chunk including chunk header and data
121 *
122 * Arguments:
123 * fp A FILE pointer
124 * chunk A pointer to the chunk to write
125 *
126 * Returns:
127 * Success: 0
128 * Failure: -1
129 */
ztr_write_chunk(FILE * fp,ztr_chunk_t * chunk)130 static int ztr_write_chunk(FILE *fp, ztr_chunk_t *chunk) {
131 int4 bei4;
132
133 /*
134 {
135 char str[5];
136 fprintf(stderr, "Write chunk %.4s %08x length %d\n",
137 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
138 }
139 */
140
141 /* type */
142 bei4 = be_int4(chunk->type);
143 if (1 != fwrite(&bei4, 4, 1, fp))
144 return -1;
145
146 /* metadata length */
147 bei4 = be_int4(chunk->mdlength);
148 if (1 != fwrite(&bei4, 4, 1, fp))
149 return -1;
150
151 /* metadata */
152 if (chunk->mdlength)
153 if (chunk->mdlength != fwrite(chunk->mdata, 1, chunk->mdlength, fp))
154 return -1;
155
156 /* data length */
157 bei4 = be_int4(chunk->dlength);
158 if (1 != fwrite(&bei4, 4, 1, fp))
159 return -1;
160
161 /* data */
162 if (chunk->dlength != fwrite(chunk->data, 1, chunk->dlength, fp))
163 return -1;
164
165 return 0;
166 }
167
168 /*
169 * fwrite_ztr
170 *
171 * Writes a ZTR file held in the ztr_t structure.
172 * It is assumed that all the correct lengths, magic numbers, etc in the
173 * ztr_t struct have already been initialised correctly.
174 *
175 * FIXME: Add a 'method' argument which encodes formats? Perhaps store this
176 * in the ztr struct?
177 *
178 * Arguments:
179 * fp A writable FILE pointer
180 * ztr A pointer to the ztr_t struct to write.
181 *
182 * Returns:
183 * Success: 0
184 * Failure: -1
185 */
fwrite_ztr(FILE * fp,ztr_t * ztr)186 int fwrite_ztr(FILE *fp, ztr_t *ztr) {
187 int i;
188
189 /* Write the header record */
190 if (-1 == ztr_write_header(fp, &ztr->header))
191 return -1;
192
193 /* Write the chunks */
194 for (i = 0; i < ztr->nchunks; i++) {
195 if (-1 == ztr_write_chunk(fp, &ztr->chunk[i]))
196 return -1;
197 #if 0
198 {
199 int fd;
200 char fname[1024];
201 sprintf(fname, "chunk.%d", i);
202 fd = open(fname, O_RDWR|O_CREAT|O_TRUNC, 0666);
203 write(fd, ztr->chunk[i].data, ztr->chunk[i].dlength);
204 close(fd);
205 }
206 #endif
207 }
208
209 return 0;
210 }
211
212 /*
213 * ---------------------------------------------------------------------------
214 * Trace reading functions.
215 * These consist of several decoding functions, all with the same prototype,
216 * and a single fread_ztr function to wrap it all up.
217 * ---------------------------------------------------------------------------
218 */
219
220 /*
221 * ztr_read_header
222 *
223 * Reads a ZTR file header.
224 *
225 * Arguments:
226 * fp A FILE pointer
227 * h Where to write the header to
228 *
229 * Returns:
230 * Success: 0
231 * Failure: -1
232 */
ztr_read_header(FILE * fp,ztr_header_t * h)233 int ztr_read_header(FILE *fp, ztr_header_t *h) {
234 if (1 != fread(h, sizeof(*h), 1, fp))
235 return -1;
236
237 return 0;
238 }
239
240 /*
241 * ztr_read_chunk_hdr
242 *
243 * Reads a ZTR chunk header and metadata, but not the main data segment.
244 *
245 * Arguments:
246 * fp A FILE pointer
247 *
248 * Returns:
249 * Success: a chunk pointer (malloced)
250 * Failure: NULL
251 */
ztr_read_chunk_hdr(FILE * fp)252 ztr_chunk_t *ztr_read_chunk_hdr(FILE *fp) {
253 int4 bei4;
254 ztr_chunk_t *chunk;
255
256 if (NULL == (chunk = (ztr_chunk_t *)xmalloc(sizeof(*chunk))))
257 return NULL;
258
259 /* type */
260 if (1 != fread(&bei4, 4, 1, fp)) {
261 xfree(chunk);
262 return NULL;
263 }
264 chunk->type = be_int4(bei4);
265
266 /* metadata length */
267 if (1 != fread(&bei4, 4, 1, fp)) {
268 xfree(chunk);
269 return NULL;
270 }
271 chunk->mdlength = be_int4(bei4);
272
273 /* metadata */
274 chunk->ztr_owns = 1;
275 if (chunk->mdlength) {
276 if (NULL == (chunk->mdata = (char *)xmalloc(chunk->mdlength))) {
277 xfree(chunk);
278 return NULL;
279 }
280 if (chunk->mdlength != fread(chunk->mdata, 1, chunk->mdlength, fp)) {
281 xfree(chunk->mdata);
282 xfree(chunk);
283 return NULL;
284 }
285 } else {
286 chunk->mdata = NULL;
287 }
288
289 /* data length */
290 if (1 != fread(&bei4, 4, 1, fp)) {
291 if (chunk->mdata)
292 xfree(chunk->mdata);
293 xfree(chunk);
294 return NULL;
295 }
296 chunk->dlength = be_int4(bei4);
297
298 return chunk;
299 }
300
ztr_process_text(ztr_t * ztr)301 void ztr_process_text(ztr_t *ztr) {
302 int i;
303 ztr_chunk_t **text_chunks = NULL;
304 int ntext_chunks = 0;
305 ztr_text_t *zt = NULL;
306 int nzt = 0;
307 int nalloc = 0;
308
309 if (ztr->text_segments)
310 /* Already done */
311 return;
312
313 text_chunks = ztr_find_chunks(ztr, ZTR_TYPE_TEXT, &ntext_chunks);
314 if (!text_chunks)
315 return;
316
317 for (i = 0; i < ntext_chunks; i++) {
318 char *data;
319 uint4 length;
320 char *ident, *value;
321
322 /* Make sure it's not compressed */
323 uncompress_chunk(ztr, text_chunks[i]);
324
325 data = text_chunks[i]->data;
326 length = text_chunks[i]->dlength;
327
328 if (!length)
329 continue;
330
331 /* Skip RAW header byte */
332 data++;
333 length--;
334
335 while (data - text_chunks[i]->data <= (ptrdiff_t)length &&
336 *(ident = data)) {
337 data += strlen(ident)+1;
338 value = data;
339 if (value)
340 data += strlen(value)+1;
341
342 if (nzt + 1 > nalloc) {
343 nalloc += 10;
344 zt = (ztr_text_t *)xrealloc(zt, nalloc * sizeof(*zt));
345 }
346 zt[nzt].ident = ident;
347 zt[nzt].value = value;
348 nzt++;
349 }
350 }
351
352 ztr->text_segments = zt;
353 ztr->ntext_segments = nzt;
354
355 /*
356 for (i = 0; i < ztr->ntext_segments; i++) {
357 fprintf(stderr, "'%s' = '%s'\n",
358 ztr->text_segments[i].ident,
359 ztr->text_segments[i].value);
360 }
361 */
362
363 xfree(text_chunks);
364 }
365
366
367 /*
368 * fread_ztr
369 *
370 * Reads a ZTR file from 'fp'. This checks for the correct magic number and
371 * major version number, but not minor version number.
372 *
373 * FIXME: Add automatic uncompression?
374 *
375 * Arguments:
376 * fp A readable FILE pointer
377 *
378 * Returns:
379 * Success: Pointer to a ztr_t structure (malloced)
380 * Failure: NULL
381 */
fread_ztr(FILE * fp)382 ztr_t *fread_ztr(FILE *fp) {
383 ztr_t *ztr;
384 ztr_chunk_t *chunk;
385 int sections = read_sections(0);
386
387 /* Allocate */
388 if (NULL == (ztr = new_ztr()))
389 return NULL;
390
391 /* Read the header */
392 if (-1 == ztr_read_header(fp, &ztr->header))
393 return NULL;
394
395 /* Check magic number and version */
396 if (memcmp(ztr->header.magic, ZTR_MAGIC, 8) != 0)
397 return NULL;
398
399 if (ztr->header.version_major != ZTR_VERSION_MAJOR)
400 return NULL;
401
402 /* Load chunks */
403 while ((chunk = ztr_read_chunk_hdr(fp))) {
404 /*
405 char str[5];
406
407 fprintf(stderr, "Read chunk %.4s %08x length %d\n",
408 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
409 */
410 switch(chunk->type) {
411 case ZTR_TYPE_HEADER:
412 /* End of file */
413 return ztr;
414
415 case ZTR_TYPE_SAMP:
416 case ZTR_TYPE_SMP4:
417 if (! (sections & READ_SAMPLES)) {
418 fseek(fp, chunk->dlength, SEEK_CUR);
419 xfree(chunk);
420 continue;
421 }
422 break;
423
424
425 case ZTR_TYPE_BASE:
426 case ZTR_TYPE_BPOS:
427 case ZTR_TYPE_CNF4:
428 case ZTR_TYPE_CNF1:
429 case ZTR_TYPE_CSID:
430 if (! (sections & READ_BASES)) {
431 fseek(fp, chunk->dlength, SEEK_CUR);
432 xfree(chunk);
433 continue;
434 }
435 break;
436
437 case ZTR_TYPE_TEXT:
438 if (! (sections & READ_COMMENTS)) {
439 fseek(fp, chunk->dlength, SEEK_CUR);
440 xfree(chunk);
441 continue;
442 }
443 break;
444
445 case ZTR_TYPE_CLIP:
446 case ZTR_TYPE_FLWO:
447 case ZTR_TYPE_FLWC:
448 break;
449
450 /*
451 default:
452 fprintf(stderr, "Unknown chunk type '%s': skipping\n",
453 ZTR_BE2STR(chunk->type,str));
454 fseek(fp, chunk->dlength, SEEK_CUR);
455 xfree(chunk);
456 continue;
457 */
458 }
459
460 chunk->ztr_owns = 1;
461 chunk->data = (char *)xmalloc(chunk->dlength);
462 if (chunk->dlength != fread(chunk->data, 1, chunk->dlength, fp)) {
463 delete_ztr(ztr);
464 return NULL;
465 }
466
467 ztr->nchunks++;
468 ztr->chunk = (ztr_chunk_t *)xrealloc(ztr->chunk, ztr->nchunks *
469 sizeof(ztr_chunk_t));
470 memcpy(&ztr->chunk[ztr->nchunks-1], chunk, sizeof(*chunk));
471 xfree(chunk);
472 }
473
474 return ztr;
475 }
476
477 /*
478 * ---------------------------------------------------------------------------
479 * Other utility functions
480 * ---------------------------------------------------------------------------
481 */
482 /*
483 * new_ztr
484 *
485 * Allocates and initialises a ztr_t structure
486 *
487 * Returns:
488 * ztr_t pointer on success
489 * NULL on failure
490 */
new_ztr(void)491 ztr_t *new_ztr(void) {
492 ztr_t *ztr;
493
494 /* Allocate */
495 if (NULL == (ztr = (ztr_t *)xmalloc(sizeof(*ztr))))
496 return NULL;
497
498 ztr->chunk = NULL;
499 ztr->nchunks = 0;
500 ztr->text_segments = NULL;
501 ztr->ntext_segments = 0;
502 ztr->delta_level = 3;
503
504 ztr->nhcodes = 0;
505 ztr->hcodes = NULL;
506 ztr->hcodes_checked = 0;
507
508 return ztr;
509 }
510
delete_ztr(ztr_t * ztr)511 void delete_ztr(ztr_t *ztr) {
512 int i;
513
514 if (!ztr)
515 return;
516
517 if (ztr->chunk) {
518 for (i = 0; i < ztr->nchunks; i++) {
519 if (ztr->chunk[i].data && ztr->chunk[i].ztr_owns)
520 xfree(ztr->chunk[i].data);
521 if (ztr->chunk[i].mdata && ztr->chunk[i].ztr_owns)
522 xfree(ztr->chunk[i].mdata);
523 }
524 xfree(ztr->chunk);
525 }
526
527 if (ztr->hcodes) {
528 for (i = 0; i < ztr->nhcodes; i++) {
529 if (ztr->hcodes[i].codes && ztr->hcodes[i].ztr_owns)
530 huffman_codeset_destroy(ztr->hcodes[i].codes);
531 }
532 free(ztr->hcodes);
533 }
534
535 if (ztr->text_segments)
536 xfree(ztr->text_segments);
537
538 xfree(ztr);
539 }
540
541 /*
542 * ztr_find_chunks
543 *
544 * Searches for chunks of a specific type.
545 *
546 * Returns:
547 * Array of ztr_chunk_t pointers (into the ztr struct). This is
548 * allocated by malloc and it is the callers duty to free this.
549 * NULL if none found.
550 */
ztr_find_chunks(ztr_t * ztr,uint4 type,int * nchunks_p)551 ztr_chunk_t **ztr_find_chunks(ztr_t *ztr, uint4 type, int *nchunks_p) {
552 ztr_chunk_t **chunks = NULL;
553 int nchunks = 0;
554 int i;
555
556 for (i = 0; i < ztr->nchunks; i++) {
557 if (ztr->chunk[i].type == type) {
558 chunks = (ztr_chunk_t **)xrealloc(chunks, (nchunks + 1) *
559 sizeof(*chunks));
560 chunks[nchunks++] = &ztr->chunk[i];
561 }
562 }
563 *nchunks_p = nchunks;
564 return chunks;
565 }
566
567 /*
568 * Shannon showed that for storage in base 'b' with alphabet symbols 'a' having
569 * a probability of ocurring in any context of 'Pa' we should encode
570 * symbol 'a' to have a storage width of -logb(Pa).
571 *
572 * Eg. b = 26, P(e) = .22. => width .4647277.
573 *
574 * We use this to calculate the entropy of a signal by summing over all letters
575 * in the signal. In this case, our storage has base 256.
576 */
577 #define EBASE 256
entropy(unsigned char * data,int len)578 static double entropy(unsigned char *data, int len) {
579 double E[EBASE];
580 double P[EBASE];
581 double e;
582 int i;
583
584 for (i = 0; i < EBASE; i++)
585 P[i] = 0;
586
587 for (i = 0; i < len; i++)
588 P[data[i]]++;
589
590 for (i = 0; i < EBASE; i++) {
591 if (P[i]) {
592 P[i] /= len;
593 E[i] = -(log(P[i])/log(EBASE));
594 } else {
595 E[i] = 0;
596 }
597 }
598
599 for (e = i = 0; i < len; i++)
600 e += E[data[i]];
601
602 return e;
603 }
604
605 /*
606 * Adds a user-defined huffman_codeset_t code-set to the available code sets
607 * used by huffman_encode or huffman_decode.
608 *
609 * Note that the 'codes' memory is then "owned" by the ztr object if "ztr_owns"
610 * is true and will be deallocated when the ztr object is destroyed. Otherwise
611 * freeing the ztr object will not touch the passed in codes.
612 */
ztr_add_hcode(ztr_t * ztr,huffman_codeset_t * codes,int ztr_owns)613 ztr_hcode_t *ztr_add_hcode(ztr_t *ztr, huffman_codeset_t *codes,
614 int ztr_owns) {
615 if (!codes)
616 return NULL;
617
618 ztr->hcodes = realloc(ztr->hcodes, (ztr->nhcodes+1)*sizeof(*ztr->hcodes));
619 ztr->hcodes[ztr->nhcodes].codes = codes;
620 ztr->hcodes[ztr->nhcodes].ztr_owns = ztr_owns;
621
622 return &ztr->hcodes[ztr->nhcodes++];
623 }
624
625 /*
626 * Searches through the cached huffman_codeset_t tables looking for a stored
627 * huffman code of type 'code_set'.
628 * NB: only code_sets >= CODE_USER will be stored here.
629 *
630 * Returns codes on success,
631 * NULL on failure
632 */
ztr_find_hcode(ztr_t * ztr,int code_set)633 ztr_hcode_t *ztr_find_hcode(ztr_t *ztr, int code_set) {
634 int i;
635
636 if (code_set < CODE_USER)
637 return NULL; /* computed on-the-fly or use a hard-coded set */
638
639 /* Check through chunks for undecoded HUFF chunks */
640 if (!ztr->hcodes_checked) {
641 for (i = 0; i < ztr->nchunks; i++) {
642 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
643 block_t *blk;
644 huffman_codeset_t *cs;
645 uncompress_chunk(ztr, &ztr->chunk[i]);
646 blk = block_create((unsigned char *)(ztr->chunk[i].data+2),
647 ztr->chunk[i].dlength-2);
648 cs = restore_codes(blk, NULL);
649 if (!cs) {
650 block_destroy(blk, 1);
651 return NULL;
652 }
653 cs->code_set = (unsigned char)(ztr->chunk[i].data[1]);
654 ztr_add_hcode(ztr, cs, 1);
655 block_destroy(blk, 1);
656 }
657 }
658 ztr->hcodes_checked = 1;
659 }
660
661 /* Check cached copies */
662 for (i = 0; i < ztr->nhcodes; i++) {
663 if (ztr->hcodes[i].codes->code_set == code_set)
664 return &ztr->hcodes[i];
665 }
666
667 return NULL;
668 }
669
ztr_find_hcode_chunk(ztr_t * ztr,int code_set)670 ztr_chunk_t *ztr_find_hcode_chunk(ztr_t *ztr, int code_set) {
671 int i;
672
673 if (code_set < CODE_USER)
674 return NULL; /* computed on-the-fly or use a hard-coded set */
675
676 /* Check through chunks for undecoded HUFF chunks */
677 for (i = 0; i < ztr->nchunks; i++) {
678 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
679 uncompress_chunk(ztr, &ztr->chunk[i]);
680 if (ztr->chunk[i].dlength >= 2 &&
681 (unsigned char)ztr->chunk[i].data[1] == code_set)
682 return &ztr->chunk[i];
683 }
684 }
685
686 return NULL;
687 }
688
689 /*
690 * Adds a new chunk to a ztr file and returns the chunk pointer.
691 * The data and mdata fields can be NULL and the chunk will not be
692 * initialised.
693 *
694 * Returns new chunk ptr on success.
695 * NULL on failure.
696 */
ztr_new_chunk(ztr_t * ztr,uint4 type,char * data,uint4 dlength,char * mdata,uint4 mdlength)697 ztr_chunk_t *ztr_new_chunk(ztr_t *ztr, uint4 type,
698 char *data, uint4 dlength,
699 char *mdata, uint4 mdlength) {
700 ztr_chunk_t *chunks, *c;
701
702 /* Grow the chunk array */
703 chunks = (ztr_chunk_t *)realloc(ztr->chunk,
704 (ztr->nchunks+1) * sizeof(*chunks));
705 if (!chunks)
706 return NULL;
707 ztr->chunk = chunks;
708
709 /* Initialise */
710 c = &ztr->chunk[ztr->nchunks++];
711 c->type = type;
712 c->data = data;
713 c->dlength = dlength;
714 c->mdata = mdata;
715 c->mdlength = mdlength;
716 c->ztr_owns = 1;
717
718 return c;
719 }
720
721 /*
722 * Adds a key/value pair to a ztr TEXT chunk.
723 * The 'ch' chunk may be explicitly specified in which case the text
724 * is added to that chunk or it may be specified as NULL in which case
725 * the key/value pair are added to the first available TEXT chunk,
726 * possibly creating a new one if required.
727 *
728 * NOTE: If the key already exists in the text chunk this appends a new
729 * copy; it does not overwrite the old one.
730 *
731 * Returns ztr text chunk ptr for success
732 * NULL for failure
733 */
ztr_add_text(ztr_t * z,ztr_chunk_t * ch,const char * key,const char * value)734 ztr_chunk_t *ztr_add_text(ztr_t *z, ztr_chunk_t *ch,
735 const char *key, const char *value) {
736 ztr_chunk_t **text_chunks = NULL;
737 int ntext_chunks;
738 size_t key_len, value_len;
739 char *cp;
740
741 /* Find/create the appropriate chunk */
742 if (!ch) {
743 text_chunks = ztr_find_chunks(z, ZTR_TYPE_TEXT, &ntext_chunks);
744 if (!text_chunks) {
745 ch = ztr_new_chunk(z, ZTR_TYPE_TEXT, NULL, 0, NULL, 0);
746 } else {
747 ch = text_chunks[0];
748 xfree(text_chunks);
749 }
750 }
751
752 if (ch->type != ZTR_TYPE_TEXT)
753 return NULL;
754
755 /* Make sure it's not compressed */
756 uncompress_chunk(z, ch);
757
758 /* Append key\0value\0 */
759 key_len = strlen(key);
760 value_len = strlen(value);
761 cp = ch->data;
762 if (cp) {
763 /* Set ch->dlength to the last non-nul byte of the previous value */
764 while (ch->dlength && ch->data[ch->dlength-1] == 0)
765 ch->dlength--;
766 }
767
768 cp = realloc(ch->data, 1 + ch->dlength + key_len + value_len + 3);
769 if (NULL == cp)
770 return NULL;
771 else
772 ch->data = cp;
773
774 cp = &ch->data[ch->dlength];
775
776 /*
777 * Note this is a bit cryptic, but it works.
778 * When appending to an existing text chunk we write a preceeding nul
779 * to mark the end of the previous value (we rewound above specifically
780 * for this case).
781 * When creating a new chunk we still write a nul, but in this case it's
782 * the RAW format byte. After the value we add an extra nul to
783 * indicate the last entry.
784 */
785 ch->dlength += 1+sprintf(cp, "%c%s%c%s%c", 0, key, 0, value, 0);
786
787 return ch;
788 }
789
790
791
792 /*
793 * Stores held ztr huffman_codes as ZTR chunks.
794 * Returns 0 for success
795 * -1 for failure
796 */
ztr_store_hcodes(ztr_t * ztr)797 int ztr_store_hcodes(ztr_t *ztr) {
798 int i;
799 ztr_chunk_t *chunks;
800 int nchunks;
801
802 if (ztr->nhcodes == 0)
803 return 0;
804
805 /* Extend chunks array */
806 nchunks = ztr->nchunks + ztr->nhcodes;
807 chunks = (ztr_chunk_t *)realloc(ztr->chunk, nchunks * sizeof(*chunks));
808 if (!chunks)
809 return -1;
810 ztr->chunk = chunks;
811
812 /* Encode */
813 for (i = 0; i < ztr->nhcodes; i++) {
814 block_t *blk = block_create(NULL, 2);
815 int j = ztr->nchunks;
816 unsigned char bytes[2];
817
818 ztr->chunk[j].type = ZTR_TYPE_HUFF;
819 ztr->chunk[j].mdata = 0;
820 ztr->chunk[j].mdlength = 0;
821 ztr->chunk[j].ztr_owns = 1;
822 bytes[0] = 0;
823 bytes[1] = ztr->hcodes[i].codes->code_set;
824 store_bytes(blk, bytes, 2);
825 /* FIXME: Now already cached in ztr_hcode_t */
826 if (0 == store_codes(blk, ztr->hcodes[i].codes, 1)) {
827 /* Last byte is always merged with first of stream */
828 if (blk->bit == 0) {
829 unsigned char zero = 0;
830 store_bytes(blk, &zero, 1);
831 }
832
833 ztr->chunk[j].data = (char *)blk->data;
834 ztr->chunk[j].dlength = blk->byte + (blk->bit != 0);
835 block_destroy(blk, 1);
836 ztr->nchunks++;
837 }
838 }
839
840 return ztr->nchunks == nchunks ? 0 : -1;
841 }
842
843 /*
844 * Given a ZTR chunk this searches through the meta-data key/value pairings
845 * to return the corresponding value.
846 *
847 * Returns a pointer into the mdata on success (nul-terminated)
848 * NULL on failure.
849 */
ztr_lookup_mdata_value(ztr_t * z,ztr_chunk_t * chunk,char * key)850 char *ztr_lookup_mdata_value(ztr_t *z, ztr_chunk_t *chunk, char *key) {
851 if (z->header.version_major > 1 ||
852 z->header.version_minor >= 2) {
853 /* ZTR format 1.2 onwards */
854 char *cp = chunk->mdata;
855 int32_t dlen = chunk->mdlength;
856
857 /*
858 * NB: we may wish to rewrite this using a dedicated state machine
859 * instead of strlen/strcmp as this currently assumes the meta-
860 * data is correctly formatted, which we cannot assume as the
861 * metadata is external and outside of our control.
862 * Passing in non-nul terminated strings could crash this code.
863 */
864 while (dlen > 0) {
865 size_t l;
866 int found;
867
868 /* key */
869 l = strlen(cp);
870 found = strcmp(cp, key) == 0;
871 cp += l+1;
872 dlen -= l+1;
873
874 /* value */
875 if (found)
876 return cp;
877 l = strlen(cp);
878 cp += l+1;
879 dlen -= l+1;
880 }
881 return NULL;
882
883 } else {
884 /* v1.1 and before only supported a few types, specifically coded
885 * per chunk type.
886 */
887 switch (chunk->type) {
888 case ZTR_TYPE_SAMP:
889 case ZTR_TYPE_SMP4:
890 if (strcmp(key, "TYPE"))
891 return chunk->mdata;
892 break;
893
894 default:
895 break;
896 }
897 }
898
899 return NULL;
900 }
901
902 /*
903 * Compresses an individual chunk using a specific format. The format is one
904 * of the 'format' fields listed in the spec; one of the ZTR_FORM_ macros.
905 */
compress_chunk(ztr_t * ztr,ztr_chunk_t * chunk,int format,int option,int option2)906 int compress_chunk(ztr_t *ztr, ztr_chunk_t *chunk, int format,
907 int option, int option2) {
908 char *new_data = NULL;
909 int new_len;
910
911 switch (format) {
912 case ZTR_FORM_RAW:
913 return 0;
914
915 case ZTR_FORM_RLE:
916 new_data = rle(chunk->data, chunk->dlength, option, &new_len);
917 if (entropy((unsigned char *)new_data, new_len) >=
918 entropy((unsigned char *)chunk->data, chunk->dlength)) {
919 xfree(new_data);
920 return 0;
921 }
922 break;
923
924 case ZTR_FORM_XRLE:
925 new_data = xrle(chunk->data, chunk->dlength, option,option2, &new_len);
926 break;
927
928 case ZTR_FORM_XRLE2:
929 new_data = xrle2(chunk->data, chunk->dlength, option, &new_len);
930 break;
931
932 case ZTR_FORM_ZLIB:
933 new_data = zlib_huff(chunk->data, chunk->dlength, option, &new_len);
934 break;
935
936 case ZTR_FORM_DELTA1:
937 new_data = decorrelate1(chunk->data, chunk->dlength, option, &new_len);
938 break;
939
940 case ZTR_FORM_DDELTA1:
941 new_data = decorrelate1dyn(chunk->data, chunk->dlength, &new_len);
942 break;
943
944 case ZTR_FORM_DELTA2:
945 new_data = decorrelate2(chunk->data, chunk->dlength, option, &new_len);
946 break;
947
948 case ZTR_FORM_DDELTA2:
949 new_data = decorrelate2dyn(chunk->data, chunk->dlength, &new_len);
950 break;
951
952 case ZTR_FORM_DELTA4:
953 new_data = decorrelate4(chunk->data, chunk->dlength, option, &new_len);
954 break;
955
956 case ZTR_FORM_16TO8:
957 new_data = shrink_16to8(chunk->data, chunk->dlength, &new_len);
958 break;
959
960 case ZTR_FORM_32TO8:
961 new_data = shrink_32to8(chunk->data, chunk->dlength, &new_len);
962 break;
963
964 case ZTR_FORM_FOLLOW1:
965 new_data = follow1(chunk->data, chunk->dlength, &new_len);
966 break;
967
968 case ZTR_FORM_ICHEB:
969 new_data = ichebcomp(chunk->data, chunk->dlength, &new_len);
970 break;
971
972 case ZTR_FORM_LOG2:
973 new_data = log2_data(chunk->data, chunk->dlength, &new_len);
974 break;
975
976 case ZTR_FORM_STHUFF:
977 new_data = sthuff(ztr, chunk->data, chunk->dlength,
978 option, option2, &new_len);
979 break;
980
981 case ZTR_FORM_QSHIFT:
982 new_data = qshift(chunk->data, chunk->dlength, &new_len);
983 break;
984
985 case ZTR_FORM_TSHIFT:
986 new_data = tshift(ztr, chunk->data, chunk->dlength, &new_len);
987 break;
988 }
989
990 if (!new_data) {
991 fprintf(stderr, "!!ERROR!!\n");
992 return -1;
993 }
994
995 /*
996 fprintf(stderr, "Format %d => %d to %d\n", format, chunk->dlength, new_len);
997 */
998
999 chunk->dlength = new_len;
1000 xfree(chunk->data);
1001 chunk->data = new_data;
1002
1003 return 0;
1004 }
1005
1006 /*
1007 * Uncompresses an individual chunk from all levels of compression.
1008 */
uncompress_chunk(ztr_t * ztr,ztr_chunk_t * chunk)1009 int uncompress_chunk(ztr_t *ztr, ztr_chunk_t *chunk) {
1010 char *new_data = NULL;
1011 int new_len;
1012
1013 while (chunk->dlength > 0 && chunk->data[0] != ZTR_FORM_RAW) {
1014 switch (chunk->data[0]) {
1015 case ZTR_FORM_RLE:
1016 new_data = unrle(chunk->data, chunk->dlength, &new_len);
1017 break;
1018
1019 case ZTR_FORM_XRLE:
1020 new_data = unxrle(chunk->data, chunk->dlength, &new_len);
1021 break;
1022
1023 case ZTR_FORM_XRLE2:
1024 new_data = unxrle2(chunk->data, chunk->dlength, &new_len);
1025 break;
1026
1027 case ZTR_FORM_ZLIB:
1028 new_data = zlib_dehuff(chunk->data, chunk->dlength, &new_len);
1029 break;
1030
1031 case ZTR_FORM_DELTA1:
1032 new_data = recorrelate1(chunk->data, chunk->dlength, &new_len);
1033 break;
1034
1035 case ZTR_FORM_DELTA2:
1036 new_data = recorrelate2(chunk->data, chunk->dlength, &new_len);
1037 break;
1038
1039 case ZTR_FORM_DELTA4:
1040 new_data = recorrelate4(chunk->data, chunk->dlength, &new_len);
1041 break;
1042
1043 case ZTR_FORM_16TO8:
1044 new_data = expand_8to16(chunk->data, chunk->dlength, &new_len);
1045 break;
1046
1047 case ZTR_FORM_32TO8:
1048 new_data = expand_8to32(chunk->data, chunk->dlength, &new_len);
1049 break;
1050
1051 case ZTR_FORM_FOLLOW1:
1052 new_data = unfollow1(chunk->data, chunk->dlength, &new_len);
1053 break;
1054
1055 case ZTR_FORM_ICHEB:
1056 new_data = ichebuncomp(chunk->data, chunk->dlength, &new_len);
1057 break;
1058
1059 case ZTR_FORM_LOG2:
1060 new_data = unlog2_data(chunk->data, chunk->dlength, &new_len);
1061 break;
1062
1063 case ZTR_FORM_STHUFF:
1064 new_data = unsthuff(ztr, chunk->data, chunk->dlength, &new_len);
1065 break;
1066
1067 case ZTR_FORM_QSHIFT:
1068 new_data = unqshift(chunk->data, chunk->dlength, &new_len);
1069 break;
1070
1071 case ZTR_FORM_TSHIFT:
1072 new_data = untshift(ztr, chunk->data, chunk->dlength, &new_len);
1073 break;
1074
1075 default:
1076 fprintf(stderr, "Unknown encoding format %d\n", chunk->data[0]);
1077 return -1;
1078 }
1079
1080 if (!new_data)
1081 return -1;
1082
1083 /*
1084 fprintf(stderr, "format %d => %d to %d\n",
1085 chunk->data[0], chunk->dlength, new_len);
1086 */
1087
1088 chunk->dlength = new_len;
1089 xfree(chunk->data);
1090 chunk->data = new_data;
1091 }
1092
1093 return 0;
1094 }
1095
1096 /*
1097 * Compresses a ztr (in memory).
1098 * Level is 0, 1, 2 or 3 (no compression, delta, delta + zlib,
1099 * chebyshev + zlib).
1100 */
compress_ztr(ztr_t * ztr,int level)1101 int compress_ztr(ztr_t *ztr, int level) {
1102 int i;
1103
1104 if (0 == level)
1105 return 0;
1106
1107 for (i = 0; i < ztr->nchunks; i++) {
1108 /*
1109 {
1110 char str[5];
1111 fprintf(stderr, "---- %.4s ----\n",
1112 ZTR_BE2STR(ztr->chunk[i].type,str));
1113 }
1114 fprintf(stderr, "Uncomp length=%d\n", ztr->chunk[i].dlength);
1115 */
1116
1117 switch(ztr->chunk[i].type) {
1118 char *type;
1119 case ZTR_TYPE_SAMP:
1120 case ZTR_TYPE_SMP4:
1121 #ifdef ILLUMINA_GA
1122 compress_chunk(ztr, &ztr->chunk[i],
1123 ZTR_FORM_STHUFF, CODE_TRACES, 0);
1124 #else
1125 type = ztr_lookup_mdata_value(ztr, &ztr->chunk[i], "TYPE");
1126 if (type && 0 == strcmp(type, "PYRW")) {
1127 /* Raw data is not really compressable */
1128 } else if (type && 0 == strcmp(type, "PYNO")) {
1129 if (level > 1) {
1130 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0);
1131 compress_chunk(ztr, &ztr->chunk[i],
1132 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1133 }
1134 } else {
1135 if (level <= 2) {
1136 /*
1137 * Experiments show that typically a double delta does
1138 * better than a single delta for 8-bit data, and the other
1139 * way around for 16-bit data
1140 */
1141 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA2,
1142 ztr->delta_level, 0);
1143 } else {
1144 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_ICHEB, 0, 0);
1145 }
1146
1147 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0);
1148 if (level > 1) {
1149 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_FOLLOW1,0, 0);
1150 /*
1151 compress_chunk(ztr, &ztr->chunk[i],
1152 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY);
1153 */
1154 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 150, 0);
1155 compress_chunk(ztr, &ztr->chunk[i],
1156 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1157 }
1158 }
1159 #endif
1160 break;
1161
1162 case ZTR_TYPE_BASE:
1163 #ifdef ILLUMINA_GA
1164 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_STHUFF, CODE_DNA, 0);
1165 #else
1166 if (level > 1) {
1167 compress_chunk(ztr, &ztr->chunk[i],
1168 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1169 }
1170 #endif
1171 break;
1172
1173 case ZTR_TYPE_CNF1:
1174 case ZTR_TYPE_CNF4:
1175 case ZTR_TYPE_CSID:
1176 #ifdef ILLUMINA_GA
1177 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0);
1178 compress_chunk(ztr, &ztr->chunk[i],
1179 ZTR_FORM_STHUFF, CODE_CONF_RLE, 0);
1180 #else
1181 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA1, 1, 0);
1182 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0);
1183 if (level > 1) {
1184 compress_chunk(ztr, &ztr->chunk[i],
1185 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1186 }
1187 #endif
1188 break;
1189
1190 case ZTR_TYPE_BPOS:
1191 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA4, 1, 0);
1192 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_32TO8, 0, 0);
1193 if (level > 1) {
1194 compress_chunk(ztr, &ztr->chunk[i],
1195 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1196 }
1197 break;
1198
1199 case ZTR_TYPE_TEXT:
1200 #ifdef ILLUMINA_GA
1201 #else
1202 if (level > 1) {
1203 compress_chunk(ztr, &ztr->chunk[i],
1204 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1205 }
1206 #endif
1207 break;
1208
1209 case ZTR_TYPE_FLWO:
1210 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_XRLE, 0, 4);
1211 break;
1212
1213 }
1214
1215 /*
1216 fprintf(stderr, "Comp length=%d\n", ztr->chunk[i].dlength);
1217 */
1218 }
1219
1220 return 0;
1221 }
1222
1223 /*
1224 * Uncompresses a ztr (in memory).
1225 */
uncompress_ztr(ztr_t * ztr)1226 int uncompress_ztr(ztr_t *ztr) {
1227 int i;
1228
1229 for (i = 0; i < ztr->nchunks; i++) {
1230 /*
1231 {
1232 char str[5];
1233 fprintf(stderr, "---- %.4s ----\n",
1234 ZTR_BE2STR(ztr->chunk[i].type,str));
1235 }
1236 */
1237 uncompress_chunk(ztr, &ztr->chunk[i]);
1238 }
1239
1240 return 0;
1241 }
1242