1 /*
2  * Copyright (c) 2005-2010, 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author(s): James Bonfield
36  *
37  * Copyright (c) 2001-2002 MEDICAL RESEARCH COUNCIL
38  * All rights reserved
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions are met:
42  *
43  *    1 Redistributions of source code must retain the above copyright notice,
44  *      this list of conditions and the following disclaimer.
45  *
46  *    2 Redistributions in binary form must reproduce the above copyright
47  *      notice, this list of conditions and the following disclaimer in
48  *      the documentation and/or other materials provided with the
49  *      distribution.
50  *
51  *    3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
52  *      MOLECULAR BIOLOGY nor the names of its contributors may be used
53  *      to endorse or promote products derived from this software without
54  *      specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
57  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
60  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66  * POSSIBILITY OF SUCH DAMAGE.
67  */
68 
69 #ifdef HAVE_CONFIG_H
70 #include "io_lib_config.h"
71 #endif
72 
73 #include <stdio.h>
74 #include <string.h>
75 #include <math.h>
76 
77 /* #include <fcntl.h> */
78 
79 #include "io_lib/ztr.h"
80 #include "io_lib/xalloc.h"
81 #include "io_lib/Read.h"
82 #include "io_lib/compression.h"
83 #include "io_lib/stdio_hack.h"
84 #include "io_lib/deflate_interlaced.h"
85 
86 /* Deprecated #define - see solexa2srf for the more up to date code */
87 /* #define ILLUMINA_GA */
88 
89 /*
90  * ---------------------------------------------------------------------------
91  * Trace writing functions.
92  * These consist of several encoding functions, all with the same prototype,
93  * and a single fwrite_ztr function to wrap it all up.
94  * ---------------------------------------------------------------------------
95  */
96 
97 /*
98  * ztr_write_header
99  *
100  * Writes a ZTR file header.
101  *
102  * Arguments:
103  * 	fp		A FILE pointer
104  *	h		A pointer to the header to write
105  *
106  * Returns:
107  *	Success:  0
108  *	Failure: -1
109  */
ztr_write_header(FILE * fp,ztr_header_t * h)110 static int ztr_write_header(FILE *fp, ztr_header_t *h) {
111     if (1 != fwrite(h, sizeof(*h), 1, fp))
112 	return -1;
113 
114     return 0;
115 }
116 
117 /*
118  * ztr_write_chunk
119  *
120  * Writes a ZTR chunk including chunk header and data
121  *
122  * Arguments:
123  * 	fp		A FILE pointer
124  *	chunk		A pointer to the chunk to write
125  *
126  * Returns:
127  *	Success:  0
128  *	Failure: -1
129  */
ztr_write_chunk(FILE * fp,ztr_chunk_t * chunk)130 static int ztr_write_chunk(FILE *fp, ztr_chunk_t *chunk) {
131     int4 bei4;
132 
133     /*
134     {
135 	char str[5];
136 	fprintf(stderr, "Write chunk %.4s %08x length %d\n",
137 		ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
138     }
139     */
140 
141     /* type */
142     bei4 = be_int4(chunk->type);
143     if (1 != fwrite(&bei4, 4, 1, fp))
144 	return -1;
145 
146     /* metadata length */
147     bei4 = be_int4(chunk->mdlength);
148     if (1 != fwrite(&bei4, 4, 1, fp))
149 	return -1;
150 
151     /* metadata */
152     if (chunk->mdlength)
153 	if (chunk->mdlength != fwrite(chunk->mdata, 1, chunk->mdlength, fp))
154 	    return -1;
155 
156     /* data length */
157     bei4 = be_int4(chunk->dlength);
158     if (1 != fwrite(&bei4, 4, 1, fp))
159 	return -1;
160 
161     /* data */
162     if (chunk->dlength != fwrite(chunk->data, 1, chunk->dlength, fp))
163 	return -1;
164 
165     return 0;
166 }
167 
168 /*
169  * fwrite_ztr
170  *
171  * Writes a ZTR file held in the ztr_t structure.
172  * It is assumed that all the correct lengths, magic numbers, etc in the
173  * ztr_t struct have already been initialised correctly.
174  *
175  * FIXME: Add a 'method' argument which encodes formats? Perhaps store this
176  * in the ztr struct?
177  *
178  * Arguments:
179  *	fp		A writable FILE pointer
180  *	ztr		A pointer to the ztr_t struct to write.
181  *
182  * Returns:
183  *	Success:  0
184  *	Failure: -1
185  */
fwrite_ztr(FILE * fp,ztr_t * ztr)186 int fwrite_ztr(FILE *fp, ztr_t *ztr) {
187     int i;
188 
189     /* Write the header record */
190     if (-1 == ztr_write_header(fp, &ztr->header))
191 	return -1;
192 
193     /* Write the chunks */
194     for (i = 0; i < ztr->nchunks; i++) {
195 	if (-1 == ztr_write_chunk(fp, &ztr->chunk[i]))
196 	    return -1;
197 #if 0
198 	{
199 	    int fd;
200 	    char fname[1024];
201 	    sprintf(fname, "chunk.%d", i);
202 	    fd = open(fname, O_RDWR|O_CREAT|O_TRUNC, 0666);
203 	    write(fd, ztr->chunk[i].data, ztr->chunk[i].dlength);
204 	    close(fd);
205 	}
206 #endif
207     }
208 
209     return 0;
210 }
211 
212 /*
213  * ---------------------------------------------------------------------------
214  * Trace reading functions.
215  * These consist of several decoding functions, all with the same prototype,
216  * and a single fread_ztr function to wrap it all up.
217  * ---------------------------------------------------------------------------
218  */
219 
220 /*
221  * ztr_read_header
222  *
223  * Reads a ZTR file header.
224  *
225  * Arguments:
226  * 	fp		A FILE pointer
227  *	h		Where to write the header to
228  *
229  * Returns:
230  *	Success:  0
231  *	Failure: -1
232  */
ztr_read_header(FILE * fp,ztr_header_t * h)233 int ztr_read_header(FILE *fp, ztr_header_t *h) {
234     if (1 != fread(h, sizeof(*h), 1, fp))
235 	return -1;
236 
237     return 0;
238 }
239 
240 /*
241  * ztr_read_chunk_hdr
242  *
243  * Reads a ZTR chunk header and metadata, but not the main data segment.
244  *
245  * Arguments:
246  * 	fp		A FILE pointer
247  *
248  * Returns:
249  *	Success: a chunk pointer (malloced)
250  *	Failure: NULL
251  */
ztr_read_chunk_hdr(FILE * fp)252 ztr_chunk_t *ztr_read_chunk_hdr(FILE *fp) {
253     int4 bei4;
254     ztr_chunk_t *chunk;
255 
256     if (NULL == (chunk = (ztr_chunk_t *)xmalloc(sizeof(*chunk))))
257 	return NULL;
258 
259     /* type */
260     if (1 != fread(&bei4, 4, 1, fp)) {
261 	xfree(chunk);
262 	return NULL;
263     }
264     chunk->type = be_int4(bei4);
265 
266     /* metadata length */
267     if (1 != fread(&bei4, 4, 1, fp)) {
268 	xfree(chunk);
269 	return NULL;
270     }
271     chunk->mdlength = be_int4(bei4);
272 
273     /* metadata */
274     chunk->ztr_owns = 1;
275     if (chunk->mdlength) {
276 	if (NULL == (chunk->mdata = (char *)xmalloc(chunk->mdlength))) {
277 	    xfree(chunk);
278 	    return NULL;
279 	}
280 	if (chunk->mdlength != fread(chunk->mdata, 1, chunk->mdlength, fp)) {
281 	    xfree(chunk->mdata);
282 	    xfree(chunk);
283 	    return NULL;
284 	}
285     } else {
286 	chunk->mdata = NULL;
287     }
288 
289     /* data length */
290     if (1 != fread(&bei4, 4, 1, fp)) {
291 	if (chunk->mdata)
292 	    xfree(chunk->mdata);
293 	xfree(chunk);
294 	return NULL;
295     }
296     chunk->dlength = be_int4(bei4);
297 
298     return chunk;
299 }
300 
ztr_process_text(ztr_t * ztr)301 void ztr_process_text(ztr_t *ztr) {
302     int i;
303     ztr_chunk_t **text_chunks = NULL;
304     int ntext_chunks = 0;
305     ztr_text_t *zt = NULL;
306     int nzt = 0;
307     int nalloc = 0;
308 
309     if (ztr->text_segments)
310 	/* Already done */
311 	return;
312 
313     text_chunks = ztr_find_chunks(ztr, ZTR_TYPE_TEXT, &ntext_chunks);
314     if (!text_chunks)
315 	return;
316 
317     for (i = 0; i < ntext_chunks; i++) {
318 	char *data;
319 	uint4 length;
320 	char *ident, *value;
321 
322 	/* Make sure it's not compressed */
323 	uncompress_chunk(ztr, text_chunks[i]);
324 
325 	data = text_chunks[i]->data;
326 	length = text_chunks[i]->dlength;
327 
328 	if (!length)
329 	    continue;
330 
331 	/* Skip RAW header byte */
332 	data++;
333 	length--;
334 
335 	while (data - text_chunks[i]->data <= (ptrdiff_t)length &&
336 	       *(ident = data)) {
337 	    data += strlen(ident)+1;
338 	    value = data;
339 	    if (value)
340 		data += strlen(value)+1;
341 
342 	    if (nzt + 1 > nalloc) {
343 		nalloc += 10;
344 		zt = (ztr_text_t *)xrealloc(zt, nalloc * sizeof(*zt));
345 	    }
346 	    zt[nzt].ident = ident;
347 	    zt[nzt].value = value;
348 	    nzt++;
349 	}
350     }
351 
352     ztr->text_segments = zt;
353     ztr->ntext_segments = nzt;
354 
355     /*
356     for (i = 0; i < ztr->ntext_segments; i++) {
357 	fprintf(stderr, "'%s' = '%s'\n",
358 		ztr->text_segments[i].ident,
359 		ztr->text_segments[i].value);
360     }
361     */
362 
363     xfree(text_chunks);
364 }
365 
366 
367 /*
368  * fread_ztr
369  *
370  * Reads a ZTR file from 'fp'. This checks for the correct magic number and
371  * major version number, but not minor version number.
372  *
373  * FIXME: Add automatic uncompression?
374  *
375  * Arguments:
376  *	fp		A readable FILE pointer
377  *
378  * Returns:
379  *	Success: Pointer to a ztr_t structure (malloced)
380  *	Failure: NULL
381  */
fread_ztr(FILE * fp)382 ztr_t *fread_ztr(FILE *fp) {
383     ztr_t *ztr;
384     ztr_chunk_t *chunk;
385     int sections = read_sections(0);
386 
387     /* Allocate */
388     if (NULL == (ztr = new_ztr()))
389 	return NULL;
390 
391     /* Read the header */
392     if (-1 == ztr_read_header(fp, &ztr->header))
393 	return NULL;
394 
395     /* Check magic number and version */
396     if (memcmp(ztr->header.magic, ZTR_MAGIC, 8) != 0)
397 	return NULL;
398 
399     if (ztr->header.version_major != ZTR_VERSION_MAJOR)
400 	return NULL;
401 
402     /* Load chunks */
403     while ((chunk = ztr_read_chunk_hdr(fp))) {
404 	/*
405 	char str[5];
406 
407 	fprintf(stderr, "Read chunk %.4s %08x length %d\n",
408 		ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
409 	*/
410 	switch(chunk->type) {
411 	case ZTR_TYPE_HEADER:
412 	    /* End of file */
413 	    return ztr;
414 
415 	case ZTR_TYPE_SAMP:
416 	case ZTR_TYPE_SMP4:
417 	    if (! (sections & READ_SAMPLES)) {
418 		fseek(fp, chunk->dlength, SEEK_CUR);
419 		xfree(chunk);
420 		continue;
421 	    }
422 	    break;
423 
424 
425 	case ZTR_TYPE_BASE:
426 	case ZTR_TYPE_BPOS:
427 	case ZTR_TYPE_CNF4:
428 	case ZTR_TYPE_CNF1:
429 	case ZTR_TYPE_CSID:
430 	    if (! (sections & READ_BASES)) {
431 		fseek(fp, chunk->dlength, SEEK_CUR);
432 		xfree(chunk);
433 		continue;
434 	    }
435 	    break;
436 
437 	case ZTR_TYPE_TEXT:
438 	    if (! (sections & READ_COMMENTS)) {
439 		fseek(fp, chunk->dlength, SEEK_CUR);
440 		xfree(chunk);
441 		continue;
442 	    }
443 	    break;
444 
445 	case ZTR_TYPE_CLIP:
446 	case ZTR_TYPE_FLWO:
447 	case ZTR_TYPE_FLWC:
448 	    break;
449 
450 	    /*
451 	default:
452 	    fprintf(stderr, "Unknown chunk type '%s': skipping\n",
453 		    ZTR_BE2STR(chunk->type,str));
454 	    fseek(fp, chunk->dlength, SEEK_CUR);
455 	    xfree(chunk);
456 	    continue;
457 	    */
458 	}
459 
460 	chunk->ztr_owns = 1;
461 	chunk->data = (char *)xmalloc(chunk->dlength);
462 	if (chunk->dlength != fread(chunk->data, 1, chunk->dlength, fp)) {
463 	    delete_ztr(ztr);
464 	    return NULL;
465 	}
466 
467 	ztr->nchunks++;
468 	ztr->chunk = (ztr_chunk_t *)xrealloc(ztr->chunk, ztr->nchunks *
469 					     sizeof(ztr_chunk_t));
470 	memcpy(&ztr->chunk[ztr->nchunks-1], chunk, sizeof(*chunk));
471 	xfree(chunk);
472     }
473 
474     return ztr;
475 }
476 
477 /*
478  * ---------------------------------------------------------------------------
479  * Other utility functions
480  * ---------------------------------------------------------------------------
481  */
482 /*
483  * new_ztr
484  *
485  * Allocates and initialises a ztr_t structure
486  *
487  * Returns:
488  *	ztr_t pointer on success
489  *	NULL on failure
490  */
new_ztr(void)491 ztr_t *new_ztr(void) {
492     ztr_t *ztr;
493 
494     /* Allocate */
495     if (NULL == (ztr = (ztr_t *)xmalloc(sizeof(*ztr))))
496 	return NULL;
497 
498     ztr->chunk = NULL;
499     ztr->nchunks = 0;
500     ztr->text_segments = NULL;
501     ztr->ntext_segments = 0;
502     ztr->delta_level = 3;
503 
504     ztr->nhcodes = 0;
505     ztr->hcodes = NULL;
506     ztr->hcodes_checked = 0;
507 
508     return ztr;
509 }
510 
delete_ztr(ztr_t * ztr)511 void delete_ztr(ztr_t *ztr) {
512     int i;
513 
514     if (!ztr)
515 	return;
516 
517     if (ztr->chunk) {
518 	for (i = 0; i < ztr->nchunks; i++) {
519 	    if (ztr->chunk[i].data && ztr->chunk[i].ztr_owns)
520 		xfree(ztr->chunk[i].data);
521 	    if (ztr->chunk[i].mdata && ztr->chunk[i].ztr_owns)
522 		xfree(ztr->chunk[i].mdata);
523 	}
524 	xfree(ztr->chunk);
525     }
526 
527     if (ztr->hcodes) {
528 	for (i = 0; i < ztr->nhcodes; i++) {
529 	    if (ztr->hcodes[i].codes && ztr->hcodes[i].ztr_owns)
530 		huffman_codeset_destroy(ztr->hcodes[i].codes);
531 	}
532 	free(ztr->hcodes);
533     }
534 
535     if (ztr->text_segments)
536 	xfree(ztr->text_segments);
537 
538     xfree(ztr);
539 }
540 
541 /*
542  * ztr_find_chunks
543  *
544  * Searches for chunks of a specific type.
545  *
546  * Returns:
547  *	Array of ztr_chunk_t pointers (into the ztr struct). This is
548  *	  allocated by malloc and it is the callers duty to free this.
549  *	NULL if none found.
550  */
ztr_find_chunks(ztr_t * ztr,uint4 type,int * nchunks_p)551 ztr_chunk_t **ztr_find_chunks(ztr_t *ztr, uint4 type, int *nchunks_p) {
552     ztr_chunk_t **chunks = NULL;
553     int nchunks = 0;
554     int i;
555 
556     for (i = 0; i < ztr->nchunks; i++) {
557 	if (ztr->chunk[i].type == type) {
558 	    chunks = (ztr_chunk_t **)xrealloc(chunks, (nchunks + 1) *
559 					      sizeof(*chunks));
560 	    chunks[nchunks++] = &ztr->chunk[i];
561 	}
562     }
563     *nchunks_p = nchunks;
564     return chunks;
565 }
566 
567 /*
568  * Shannon showed that for storage in base 'b' with alphabet symbols 'a' having
569  * a probability of ocurring in any context of 'Pa' we should encode
570  * symbol 'a' to have a storage width of -logb(Pa).
571  *
572  * Eg. b = 26, P(e) = .22. => width .4647277.
573  *
574  * We use this to calculate the entropy of a signal by summing over all letters
575  * in the signal. In this case, our storage has base 256.
576  */
577 #define EBASE 256
entropy(unsigned char * data,int len)578 static double entropy(unsigned char *data, int len) {
579     double E[EBASE];
580     double P[EBASE];
581     double e;
582     int i;
583 
584     for (i = 0; i < EBASE; i++)
585         P[i] = 0;
586 
587     for (i = 0; i < len; i++)
588         P[data[i]]++;
589 
590     for (i = 0; i < EBASE; i++) {
591         if (P[i]) {
592             P[i] /= len;
593             E[i] = -(log(P[i])/log(EBASE));
594         } else {
595             E[i] = 0;
596         }
597     }
598 
599     for (e = i = 0; i < len; i++)
600         e += E[data[i]];
601 
602     return e;
603 }
604 
605 /*
606  * Adds a user-defined huffman_codeset_t code-set to the available code sets
607  * used by huffman_encode or huffman_decode.
608  *
609  * Note that the 'codes' memory is then "owned" by the ztr object if "ztr_owns"
610  * is true and will be deallocated when the ztr object is destroyed. Otherwise
611  * freeing the ztr object will not touch the passed in codes.
612  */
ztr_add_hcode(ztr_t * ztr,huffman_codeset_t * codes,int ztr_owns)613 ztr_hcode_t *ztr_add_hcode(ztr_t *ztr, huffman_codeset_t *codes,
614 			   int ztr_owns) {
615     if (!codes)
616 	return NULL;
617 
618     ztr->hcodes = realloc(ztr->hcodes, (ztr->nhcodes+1)*sizeof(*ztr->hcodes));
619     ztr->hcodes[ztr->nhcodes].codes = codes;
620     ztr->hcodes[ztr->nhcodes].ztr_owns = ztr_owns;
621 
622     return &ztr->hcodes[ztr->nhcodes++];
623 }
624 
625 /*
626  * Searches through the cached huffman_codeset_t tables looking for a stored
627  * huffman code of type 'code_set'.
628  * NB: only code_sets >= CODE_USER will be stored here.
629  *
630  * Returns codes on success,
631  *         NULL on failure
632  */
ztr_find_hcode(ztr_t * ztr,int code_set)633 ztr_hcode_t *ztr_find_hcode(ztr_t *ztr, int code_set) {
634     int i;
635 
636     if (code_set < CODE_USER)
637 	return NULL; /* computed on-the-fly or use a hard-coded set */
638 
639     /* Check through chunks for undecoded HUFF chunks */
640     if (!ztr->hcodes_checked) {
641 	for (i = 0; i < ztr->nchunks; i++) {
642 	    if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
643 		block_t *blk;
644 		huffman_codeset_t *cs;
645 		uncompress_chunk(ztr, &ztr->chunk[i]);
646 		blk = block_create((unsigned char *)(ztr->chunk[i].data+2),
647 				   ztr->chunk[i].dlength-2);
648 		cs = restore_codes(blk, NULL);
649 		if (!cs) {
650 		    block_destroy(blk, 1);
651 		    return NULL;
652 		}
653 		cs->code_set = (unsigned char)(ztr->chunk[i].data[1]);
654 		ztr_add_hcode(ztr, cs, 1);
655 		block_destroy(blk, 1);
656 	    }
657 	}
658 	ztr->hcodes_checked = 1;
659     }
660 
661     /* Check cached copies */
662     for (i = 0; i < ztr->nhcodes; i++) {
663 	if (ztr->hcodes[i].codes->code_set == code_set)
664 	    return &ztr->hcodes[i];
665     }
666 
667     return NULL;
668 }
669 
ztr_find_hcode_chunk(ztr_t * ztr,int code_set)670 ztr_chunk_t *ztr_find_hcode_chunk(ztr_t *ztr, int code_set) {
671     int i;
672 
673     if (code_set < CODE_USER)
674 	return NULL; /* computed on-the-fly or use a hard-coded set */
675 
676     /* Check through chunks for undecoded HUFF chunks */
677     for (i = 0; i < ztr->nchunks; i++) {
678 	if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
679 	    uncompress_chunk(ztr, &ztr->chunk[i]);
680 	    if (ztr->chunk[i].dlength >= 2 &&
681 		(unsigned char)ztr->chunk[i].data[1] == code_set)
682 		return &ztr->chunk[i];
683 	}
684     }
685 
686     return NULL;
687 }
688 
689 /*
690  * Adds a new chunk to a ztr file and returns the chunk pointer.
691  * The data and mdata fields can be NULL and the chunk will not be
692  * initialised.
693  *
694  * Returns new chunk ptr on success.
695  *         NULL on failure.
696  */
ztr_new_chunk(ztr_t * ztr,uint4 type,char * data,uint4 dlength,char * mdata,uint4 mdlength)697 ztr_chunk_t *ztr_new_chunk(ztr_t *ztr, uint4 type,
698 			   char *data,  uint4 dlength,
699 			   char *mdata, uint4 mdlength) {
700     ztr_chunk_t *chunks, *c;
701 
702     /* Grow the chunk array */
703     chunks = (ztr_chunk_t *)realloc(ztr->chunk,
704 				    (ztr->nchunks+1) * sizeof(*chunks));
705     if (!chunks)
706 	return NULL;
707     ztr->chunk = chunks;
708 
709     /* Initialise */
710     c = &ztr->chunk[ztr->nchunks++];
711     c->type     = type;
712     c->data     = data;
713     c->dlength  = dlength;
714     c->mdata    = mdata;
715     c->mdlength = mdlength;
716     c->ztr_owns = 1;
717 
718     return c;
719 }
720 
721 /*
722  * Adds a key/value pair to a ztr TEXT chunk.
723  * The 'ch' chunk may be explicitly specified in which case the text
724  * is added to that chunk or it may be specified as NULL in which case
725  * the key/value pair are added to the first available TEXT chunk,
726  * possibly creating a new one if required.
727  *
728  * NOTE: If the key already exists in the text chunk this appends a new
729  * copy; it does not overwrite the old one.
730  *
731  * Returns ztr text chunk ptr for success
732  *        NULL for failure
733  */
ztr_add_text(ztr_t * z,ztr_chunk_t * ch,const char * key,const char * value)734 ztr_chunk_t *ztr_add_text(ztr_t *z, ztr_chunk_t *ch,
735 			  const char *key, const char *value) {
736     ztr_chunk_t **text_chunks = NULL;
737     int ntext_chunks;
738     size_t key_len, value_len;
739     char *cp;
740 
741     /* Find/create the appropriate chunk */
742     if (!ch) {
743 	text_chunks = ztr_find_chunks(z, ZTR_TYPE_TEXT, &ntext_chunks);
744 	if (!text_chunks) {
745 	    ch = ztr_new_chunk(z, ZTR_TYPE_TEXT, NULL, 0, NULL, 0);
746 	} else {
747 	    ch = text_chunks[0];
748 	    xfree(text_chunks);
749 	}
750     }
751 
752     if (ch->type != ZTR_TYPE_TEXT)
753 	return NULL;
754 
755     /* Make sure it's not compressed */
756     uncompress_chunk(z, ch);
757 
758     /* Append key\0value\0 */
759     key_len = strlen(key);
760     value_len = strlen(value);
761     cp = ch->data;
762     if (cp) {
763 	/* Set ch->dlength to the last non-nul byte of the previous value */
764 	while (ch->dlength && ch->data[ch->dlength-1] == 0)
765 	    ch->dlength--;
766     }
767 
768     cp = realloc(ch->data, 1 + ch->dlength + key_len + value_len + 3);
769     if (NULL == cp)
770 	return NULL;
771     else
772 	ch->data = cp;
773 
774     cp = &ch->data[ch->dlength];
775 
776     /*
777      * Note this is a bit cryptic, but it works.
778      * When appending to an existing text chunk we write a preceeding nul
779      * to mark the end of the previous value (we rewound above specifically
780      * for this case).
781      * When creating a new chunk we still write a nul, but in this case it's
782      * the RAW format byte.  After the value we add an extra nul to
783      * indicate the last entry.
784      */
785     ch->dlength += 1+sprintf(cp, "%c%s%c%s%c", 0, key, 0, value, 0);
786 
787     return ch;
788 }
789 
790 
791 
792 /*
793  * Stores held ztr huffman_codes as ZTR chunks.
794  * Returns 0 for success
795  *        -1 for failure
796  */
ztr_store_hcodes(ztr_t * ztr)797 int ztr_store_hcodes(ztr_t *ztr) {
798     int i;
799     ztr_chunk_t *chunks;
800     int nchunks;
801 
802     if (ztr->nhcodes == 0)
803 	return 0;
804 
805     /* Extend chunks array */
806     nchunks = ztr->nchunks + ztr->nhcodes;
807     chunks = (ztr_chunk_t *)realloc(ztr->chunk, nchunks * sizeof(*chunks));
808     if (!chunks)
809 	return -1;
810     ztr->chunk = chunks;
811 
812     /* Encode */
813     for (i = 0; i < ztr->nhcodes; i++) {
814 	block_t *blk = block_create(NULL, 2);
815 	int j = ztr->nchunks;
816 	unsigned char bytes[2];
817 
818 	ztr->chunk[j].type = ZTR_TYPE_HUFF;
819 	ztr->chunk[j].mdata = 0;
820 	ztr->chunk[j].mdlength = 0;
821 	ztr->chunk[j].ztr_owns = 1;
822 	bytes[0] = 0;
823 	bytes[1] = ztr->hcodes[i].codes->code_set;
824 	store_bytes(blk, bytes, 2);
825 	/* FIXME: Now already cached in ztr_hcode_t */
826 	if (0 == store_codes(blk, ztr->hcodes[i].codes, 1)) {
827 	    /* Last byte is always merged with first of stream */
828 	    if (blk->bit == 0) {
829 		unsigned char zero = 0;
830 		store_bytes(blk, &zero, 1);
831 	    }
832 
833 	    ztr->chunk[j].data = (char *)blk->data;
834 	    ztr->chunk[j].dlength = blk->byte + (blk->bit != 0);
835 	    block_destroy(blk, 1);
836 	    ztr->nchunks++;
837 	}
838     }
839 
840     return ztr->nchunks == nchunks ? 0 : -1;
841 }
842 
843 /*
844  * Given a ZTR chunk this searches through the meta-data key/value pairings
845  * to return the corresponding value.
846  *
847  * Returns a pointer into the mdata on success (nul-terminated)
848  *         NULL on failure.
849  */
ztr_lookup_mdata_value(ztr_t * z,ztr_chunk_t * chunk,char * key)850 char *ztr_lookup_mdata_value(ztr_t *z, ztr_chunk_t *chunk, char *key) {
851     if (z->header.version_major > 1 ||
852 	z->header.version_minor >= 2) {
853 	/* ZTR format 1.2 onwards */
854 	char *cp = chunk->mdata;
855 	int32_t dlen = chunk->mdlength;
856 
857 	/*
858 	 * NB: we may wish to rewrite this using a dedicated state machine
859 	 * instead of strlen/strcmp as this currently assumes the meta-
860 	 * data is correctly formatted, which we cannot assume as the
861 	 * metadata is external and outside of our control.
862 	 * Passing in non-nul terminated strings could crash this code.
863 	 */
864 	while (dlen > 0) {
865 	    size_t l;
866 	    int found;
867 
868 	    /* key */
869 	    l = strlen(cp);
870 	    found = strcmp(cp, key) == 0;
871 	    cp += l+1;
872 	    dlen -= l+1;
873 
874 	    /* value */
875 	    if (found)
876 		return cp;
877 	    l = strlen(cp);
878 	    cp += l+1;
879 	    dlen -= l+1;
880 	}
881 	return NULL;
882 
883     } else {
884 	/* v1.1 and before only supported a few types, specifically coded
885 	 * per chunk type.
886 	 */
887 	switch (chunk->type) {
888 	case ZTR_TYPE_SAMP:
889 	case ZTR_TYPE_SMP4:
890 	    if (strcmp(key, "TYPE"))
891 		return chunk->mdata;
892 	    break;
893 
894 	default:
895 	    break;
896 	}
897     }
898 
899     return NULL;
900 }
901 
902 /*
903  * Compresses an individual chunk using a specific format. The format is one
904  * of the 'format' fields listed in the spec; one of the ZTR_FORM_ macros.
905  */
compress_chunk(ztr_t * ztr,ztr_chunk_t * chunk,int format,int option,int option2)906 int compress_chunk(ztr_t *ztr, ztr_chunk_t *chunk, int format,
907 		   int option, int option2) {
908     char *new_data = NULL;
909     int new_len;
910 
911     switch (format) {
912     case ZTR_FORM_RAW:
913 	return 0;
914 
915     case ZTR_FORM_RLE:
916 	new_data = rle(chunk->data, chunk->dlength, option, &new_len);
917 	if (entropy((unsigned char *)new_data, new_len) >=
918 	    entropy((unsigned char *)chunk->data, chunk->dlength)) {
919 	    xfree(new_data);
920 	    return 0;
921 	}
922 	break;
923 
924     case ZTR_FORM_XRLE:
925 	new_data = xrle(chunk->data, chunk->dlength, option,option2, &new_len);
926 	break;
927 
928     case ZTR_FORM_XRLE2:
929 	new_data = xrle2(chunk->data, chunk->dlength, option, &new_len);
930 	break;
931 
932     case ZTR_FORM_ZLIB:
933 	new_data = zlib_huff(chunk->data, chunk->dlength, option, &new_len);
934 	break;
935 
936     case ZTR_FORM_DELTA1:
937 	new_data = decorrelate1(chunk->data, chunk->dlength, option, &new_len);
938 	break;
939 
940     case ZTR_FORM_DDELTA1:
941 	new_data = decorrelate1dyn(chunk->data, chunk->dlength, &new_len);
942 	break;
943 
944     case ZTR_FORM_DELTA2:
945 	new_data = decorrelate2(chunk->data, chunk->dlength, option, &new_len);
946 	break;
947 
948     case ZTR_FORM_DDELTA2:
949 	new_data = decorrelate2dyn(chunk->data, chunk->dlength, &new_len);
950 	break;
951 
952     case ZTR_FORM_DELTA4:
953 	new_data = decorrelate4(chunk->data, chunk->dlength, option, &new_len);
954 	break;
955 
956     case ZTR_FORM_16TO8:
957 	new_data = shrink_16to8(chunk->data, chunk->dlength, &new_len);
958 	break;
959 
960     case ZTR_FORM_32TO8:
961 	new_data = shrink_32to8(chunk->data, chunk->dlength, &new_len);
962 	break;
963 
964     case ZTR_FORM_FOLLOW1:
965 	new_data = follow1(chunk->data, chunk->dlength, &new_len);
966 	break;
967 
968     case ZTR_FORM_ICHEB:
969 	new_data = ichebcomp(chunk->data, chunk->dlength, &new_len);
970 	break;
971 
972     case ZTR_FORM_LOG2:
973 	new_data = log2_data(chunk->data, chunk->dlength, &new_len);
974 	break;
975 
976     case ZTR_FORM_STHUFF:
977 	new_data = sthuff(ztr, chunk->data, chunk->dlength,
978 			  option, option2, &new_len);
979 	break;
980 
981     case ZTR_FORM_QSHIFT:
982 	new_data = qshift(chunk->data, chunk->dlength, &new_len);
983 	break;
984 
985     case ZTR_FORM_TSHIFT:
986 	new_data = tshift(ztr, chunk->data, chunk->dlength, &new_len);
987 	break;
988     }
989 
990     if (!new_data) {
991 	fprintf(stderr, "!!ERROR!!\n");
992 	return -1;
993     }
994 
995     /*
996     fprintf(stderr, "Format %d => %d to %d\n", format, chunk->dlength, new_len);
997     */
998 
999     chunk->dlength = new_len;
1000     xfree(chunk->data);
1001     chunk->data = new_data;
1002 
1003     return 0;
1004 }
1005 
1006 /*
1007  * Uncompresses an individual chunk from all levels of compression.
1008  */
uncompress_chunk(ztr_t * ztr,ztr_chunk_t * chunk)1009 int uncompress_chunk(ztr_t *ztr, ztr_chunk_t *chunk) {
1010     char *new_data = NULL;
1011     int new_len;
1012 
1013     while (chunk->dlength > 0 && chunk->data[0] != ZTR_FORM_RAW) {
1014 	switch (chunk->data[0]) {
1015 	case ZTR_FORM_RLE:
1016 	    new_data = unrle(chunk->data, chunk->dlength, &new_len);
1017 	    break;
1018 
1019 	case ZTR_FORM_XRLE:
1020 	    new_data = unxrle(chunk->data, chunk->dlength, &new_len);
1021 	    break;
1022 
1023 	case ZTR_FORM_XRLE2:
1024 	    new_data = unxrle2(chunk->data, chunk->dlength, &new_len);
1025 	    break;
1026 
1027 	case ZTR_FORM_ZLIB:
1028 	    new_data = zlib_dehuff(chunk->data, chunk->dlength, &new_len);
1029 	    break;
1030 
1031 	case ZTR_FORM_DELTA1:
1032 	    new_data = recorrelate1(chunk->data, chunk->dlength, &new_len);
1033 	    break;
1034 
1035 	case ZTR_FORM_DELTA2:
1036 	    new_data = recorrelate2(chunk->data, chunk->dlength, &new_len);
1037 	    break;
1038 
1039 	case ZTR_FORM_DELTA4:
1040 	    new_data = recorrelate4(chunk->data, chunk->dlength, &new_len);
1041 	    break;
1042 
1043 	case ZTR_FORM_16TO8:
1044 	    new_data = expand_8to16(chunk->data, chunk->dlength, &new_len);
1045 	    break;
1046 
1047 	case ZTR_FORM_32TO8:
1048 	    new_data = expand_8to32(chunk->data, chunk->dlength, &new_len);
1049 	    break;
1050 
1051 	case ZTR_FORM_FOLLOW1:
1052 	    new_data = unfollow1(chunk->data, chunk->dlength, &new_len);
1053 	    break;
1054 
1055 	case ZTR_FORM_ICHEB:
1056 	    new_data = ichebuncomp(chunk->data, chunk->dlength, &new_len);
1057 	    break;
1058 
1059 	case ZTR_FORM_LOG2:
1060 	    new_data = unlog2_data(chunk->data, chunk->dlength, &new_len);
1061 	    break;
1062 
1063 	case ZTR_FORM_STHUFF:
1064 	    new_data = unsthuff(ztr, chunk->data, chunk->dlength, &new_len);
1065 	    break;
1066 
1067 	case ZTR_FORM_QSHIFT:
1068 	    new_data = unqshift(chunk->data, chunk->dlength, &new_len);
1069 	    break;
1070 
1071 	case ZTR_FORM_TSHIFT:
1072 	    new_data = untshift(ztr, chunk->data, chunk->dlength, &new_len);
1073 	    break;
1074 
1075 	default:
1076 	    fprintf(stderr, "Unknown encoding format %d\n", chunk->data[0]);
1077 	    return -1;
1078 	}
1079 
1080 	if (!new_data)
1081 	    return -1;
1082 
1083 	/*
1084 	fprintf(stderr, "format %d => %d to %d\n",
1085 		chunk->data[0], chunk->dlength, new_len);
1086 	*/
1087 
1088 	chunk->dlength = new_len;
1089 	xfree(chunk->data);
1090 	chunk->data = new_data;
1091     }
1092 
1093     return 0;
1094 }
1095 
1096 /*
1097  * Compresses a ztr (in memory).
1098  * Level is 0, 1, 2 or 3 (no compression, delta, delta + zlib,
1099  * chebyshev + zlib).
1100  */
compress_ztr(ztr_t * ztr,int level)1101 int compress_ztr(ztr_t *ztr, int level) {
1102     int i;
1103 
1104     if (0 == level)
1105 	return 0;
1106 
1107     for (i = 0; i < ztr->nchunks; i++) {
1108 	/*
1109 	{
1110 	    char str[5];
1111 	    fprintf(stderr, "---- %.4s ----\n",
1112 		    ZTR_BE2STR(ztr->chunk[i].type,str));
1113 	}
1114 	fprintf(stderr, "Uncomp length=%d\n", ztr->chunk[i].dlength);
1115 	*/
1116 
1117 	switch(ztr->chunk[i].type) {
1118 	    char *type;
1119 	case ZTR_TYPE_SAMP:
1120 	case ZTR_TYPE_SMP4:
1121 #ifdef ILLUMINA_GA
1122 	    compress_chunk(ztr, &ztr->chunk[i],
1123 			   ZTR_FORM_STHUFF, CODE_TRACES, 0);
1124 #else
1125 	    type = ztr_lookup_mdata_value(ztr, &ztr->chunk[i], "TYPE");
1126 	    if (type && 0 == strcmp(type, "PYRW")) {
1127 		/* Raw data is not really compressable */
1128 	    } else if (type && 0 == strcmp(type, "PYNO")) {
1129 		if (level > 1) {
1130 		    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8,  0, 0);
1131 		    compress_chunk(ztr, &ztr->chunk[i],
1132 				   ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1133 		}
1134 	    } else {
1135 		if (level <= 2) {
1136 		    /*
1137 		     * Experiments show that typically a double delta does
1138 		     * better than a single delta for 8-bit data, and the other
1139 		     * way around for 16-bit data
1140 		     */
1141 		    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA2,
1142 				   ztr->delta_level, 0);
1143 		} else {
1144 		    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_ICHEB,  0, 0);
1145 		}
1146 
1147 		compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8,  0, 0);
1148 		if (level > 1) {
1149 		    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_FOLLOW1,0, 0);
1150 		    /*
1151 		      compress_chunk(ztr, &ztr->chunk[i],
1152 		                     ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY);
1153 		    */
1154 		    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE,  150, 0);
1155 		    compress_chunk(ztr, &ztr->chunk[i],
1156 		    		   ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1157 		}
1158 	    }
1159 #endif
1160 	    break;
1161 
1162 	case ZTR_TYPE_BASE:
1163 #ifdef ILLUMINA_GA
1164 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_STHUFF, CODE_DNA, 0);
1165 #else
1166 	    if (level > 1) {
1167 		compress_chunk(ztr, &ztr->chunk[i],
1168 			       ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1169 	    }
1170 #endif
1171 	    break;
1172 
1173 	case ZTR_TYPE_CNF1:
1174 	case ZTR_TYPE_CNF4:
1175 	case ZTR_TYPE_CSID:
1176 #ifdef ILLUMINA_GA
1177 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE,  77, 0);
1178 	    compress_chunk(ztr, &ztr->chunk[i],
1179 			   ZTR_FORM_STHUFF, CODE_CONF_RLE, 0);
1180 #else
1181 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA1, 1, 0);
1182 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE,  77, 0);
1183 	    if (level > 1) {
1184 		compress_chunk(ztr, &ztr->chunk[i],
1185 			       ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1186 	    }
1187 #endif
1188 	    break;
1189 
1190 	case ZTR_TYPE_BPOS:
1191 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA4, 1, 0);
1192 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_32TO8,  0, 0);
1193 	    if (level > 1) {
1194 		compress_chunk(ztr, &ztr->chunk[i],
1195 			       ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1196 	    }
1197 	    break;
1198 
1199 	case ZTR_TYPE_TEXT:
1200 #ifdef ILLUMINA_GA
1201 #else
1202 	    if (level > 1) {
1203 		compress_chunk(ztr, &ztr->chunk[i],
1204 			       ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1205 	    }
1206 #endif
1207 	    break;
1208 
1209 	case ZTR_TYPE_FLWO:
1210 	    compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_XRLE, 0, 4);
1211 	    break;
1212 
1213 	}
1214 
1215 	/*
1216 	fprintf(stderr, "Comp length=%d\n", ztr->chunk[i].dlength);
1217 	*/
1218     }
1219 
1220     return 0;
1221 }
1222 
1223 /*
1224  * Uncompresses a ztr (in memory).
1225  */
uncompress_ztr(ztr_t * ztr)1226 int uncompress_ztr(ztr_t *ztr) {
1227     int i;
1228 
1229     for (i = 0; i < ztr->nchunks; i++) {
1230 	/*
1231 	{
1232             char str[5];
1233 	    fprintf(stderr, "---- %.4s ----\n",
1234 		    ZTR_BE2STR(ztr->chunk[i].type,str));
1235 	}
1236 	*/
1237 	uncompress_chunk(ztr, &ztr->chunk[i]);
1238     }
1239 
1240     return 0;
1241 }
1242