1 /*
2 * Copyright (c) 2007-2009, 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifdef HAVE_CONFIG_H
35 #include "io_lib_config.h"
36 #endif
37
38 #include <stdio.h>
39 #include <assert.h>
40 #include <string.h>
41 #include "io_lib/Read.h"
42 #include "io_lib/misc.h"
43 #include "io_lib/ztr.h"
44 #include "io_lib/hash_table.h"
45 #include "io_lib/xalloc.h"
46
47 #include "io_lib/srf.h"
48
49 /*
50 * ---------------------------------------------------------------------------
51 * Object creation / destruction.
52 */
53
54 /*
55 * Allocates and returns an srf_t structure.
56 * fp is the file point associated with this SRF file. Pass as NULL if unknown
57 * at this stage.
58 *
59 * Returns malloced memory on success
60 * NULL on failure
61 */
srf_create(FILE * fp)62 srf_t *srf_create(FILE *fp) {
63 srf_t *srf = (srf_t *)calloc(1, sizeof(*srf));
64
65 if (srf)
66 srf->fp = fp;
67 return srf;
68 }
69
70 /*
71 * Opens an SRF archive for reading or writing as defined by 'mode'. Mode is
72 * passed directly on to fopen() and so uses the same flags.
73 *
74 * Returns a srf_t struct pointer on success.
75 * NULL on failure
76 */
srf_open(char * fn,char * mode)77 srf_t *srf_open(char *fn, char *mode) {
78 FILE *fp;
79 char bmode[11];
80 size_t l, i;
81
82 /* Enforce binary mode for windows */
83 if ((l = strlen(mode)) < 9) {
84 int binary = 0;
85 for (i = 0; i < l; i++) {
86 if ('b' == (bmode[i] = mode[i]))
87 binary=1;
88 }
89 if (!binary)
90 bmode[i++] = 'b';
91 bmode[i] = 0;
92 mode = bmode;
93 }
94
95 return (fp = fopen(fn, mode)) ? srf_create(fp) : NULL;
96 }
97
98 /*
99 * Deallocates an srf_t struct. If auto_close is true then it also closes
100 * any associated FILE pointer.
101 */
srf_destroy(srf_t * srf,int auto_close)102 void srf_destroy(srf_t *srf, int auto_close) {
103 if (!srf)
104 return;
105
106 if (auto_close && srf->fp) {
107 if (-1 == fclose(srf->fp))
108 perror("fclose(srf->fp)");
109 }
110
111 if(srf->th.trace_hdr)
112 free(srf->th.trace_hdr);
113
114 if (srf->mf)
115 mfdestroy(srf->mf);
116
117 if (srf->ztr)
118 delete_ztr(srf->ztr);
119
120 free(srf);
121 }
122
123 /*
124 * ---------------------------------------------------------------------------
125 * Base data type I/O.
126 */
127
128 /*
129 * Writes a null-terminated C string in pascal-string form.
130 * Returns the number of bytes written or
131 * -1 for failure.
132 */
srf_write_pstring(srf_t * srf,char * str)133 int srf_write_pstring(srf_t *srf, char *str) {
134 size_t l = str ? strlen(str) : 0;
135 if (l > 255)
136 return -1;
137
138 if (l)
139 return fprintf(srf->fp, "%c%s", (int)l, str);
140 else
141 return fprintf(srf->fp, "%c", (int)l);
142 }
143
144 /*
145 * As per srf_write_pstring but 'str' may hold binary data including nul
146 * chars, hence we pass over length as a separate paremeter.
147 */
srf_write_pstringb(srf_t * srf,char * str,int length)148 int srf_write_pstringb(srf_t *srf, char *str, int length) {
149 if (length > 255 || length < 0)
150 return -1;
151
152 if (length)
153 return fprintf(srf->fp, "%c%s", (int)length, str);
154 else
155 return fprintf(srf->fp, "%c", (int)length);
156 }
157
158 /*
159 * Reads a pascal-style string from the srf file.
160 * 'str' passed in needs to be at least 256 bytes long. The string read
161 * will be stored there and nul terminated.
162 *
163 * Returns the length of the string read (minus nul char),
164 * -1 for failure
165 */
srf_read_pstring(srf_t * srf,char * str)166 int srf_read_pstring(srf_t *srf, char *str) {
167 int len;
168
169 if (EOF == (len = fgetc(srf->fp)))
170 return -1;
171 if (len != fread(str, 1, len, srf->fp))
172 return -1;
173 str[len] = '\0';
174
175 return len;
176 }
177
178 /*
179 * Read/write unsigned 32-bit and 64-bit values in big-endian format
180 * (ie the same as ZTR and SCF endian uses).
181 * Functions all return 0 for success, -1 for failure.
182 */
srf_read_uint32(srf_t * srf,uint32_t * val)183 int srf_read_uint32(srf_t *srf, uint32_t *val) {
184 unsigned char d[4];
185 if (1 != fread(d, 4, 1, srf->fp))
186 return -1;
187
188 *val = (d[0] << 24) | (d[1] << 16) | (d[2] << 8) | (d[3] << 0);
189 return 0;
190 }
191
srf_write_uint32(srf_t * srf,uint32_t val)192 int srf_write_uint32(srf_t *srf, uint32_t val) {
193 unsigned char d[4];
194 d[0] = (val >> 24) & 0xff;
195 d[1] = (val >> 16) & 0xff;
196 d[2] = (val >> 8) & 0xff;
197 d[3] = (val >> 0) & 0xff;
198
199 return fwrite(d, 4, 1, srf->fp) ? 0 : -1;
200 }
201
srf_read_uint64(srf_t * srf,uint64_t * val)202 int srf_read_uint64(srf_t *srf, uint64_t *val) {
203 unsigned char d[8];
204 if (1 != fread(d, 8, 1, srf->fp))
205 return -1;
206
207 *val = ((uint64_t)d[0] << 56)
208 | ((uint64_t)d[1] << 48)
209 | ((uint64_t)d[2] << 40)
210 | ((uint64_t)d[3] << 32)
211 | ((uint64_t)d[4] << 24)
212 | ((uint64_t)d[5] << 16)
213 | ((uint64_t)d[6] << 8)
214 | ((uint64_t)d[7] << 0);
215 return 0;
216 }
217
srf_write_uint64(srf_t * srf,uint64_t val)218 int srf_write_uint64(srf_t *srf, uint64_t val) {
219 unsigned char d[8];
220 d[0] = (val >> 56) & 0xff;
221 d[1] = (val >> 48) & 0xff;
222 d[2] = (val >> 40) & 0xff;
223 d[3] = (val >> 32) & 0xff;
224 d[4] = (val >> 24) & 0xff;
225 d[5] = (val >> 16) & 0xff;
226 d[6] = (val >> 8) & 0xff;
227 d[7] = (val >> 0) & 0xff;
228
229 return fwrite(d, 8, 1, srf->fp) ? 0 : -1;
230 }
231
232 /*
233 * ---------------------------------------------------------------------------
234 * Mid level I/O - srf block type handling
235 */
236
237
238 /*
239 * Allocates and initialises a container header structure. An existing
240 * srf_cont_hdr_t may be passed in to avoid allocation of a new object.
241 *
242 * Returns: allocated cont_header on success, to be freed using
243 * srf_destroy_cont_hdr() (if allocated here),
244 * NULL on failure.
245 */
srf_construct_cont_hdr(srf_cont_hdr_t * ch,char * bc,char * bc_version)246 srf_cont_hdr_t *srf_construct_cont_hdr(srf_cont_hdr_t *ch,
247 char *bc,
248 char *bc_version) {
249 if (!ch) {
250 if (NULL == (ch = (srf_cont_hdr_t *)calloc(1, sizeof(*ch))))
251 return NULL;
252 }
253
254 ch->block_type = SRFB_CONTAINER;
255 strcpy(ch->version, SRF_VERSION);
256 ch->container_type = 'Z';
257 strncpy(ch->base_caller, bc, 255);
258 strncpy(ch->base_caller_version, bc_version, 255);
259
260 return ch;
261 }
262
263 /*
264 * Deallocates an srf_cont_hdr_t constructed by srf_construct_cont_hdr().
265 */
srf_destroy_cont_hdr(srf_cont_hdr_t * ch)266 void srf_destroy_cont_hdr(srf_cont_hdr_t *ch) {
267 if (ch)
268 free(ch);
269 }
270
271 /*
272 * Reads a container header and stores the result in 'ch'.
273 * Returns 0 for success
274 * -1 for failure
275 */
srf_read_cont_hdr(srf_t * srf,srf_cont_hdr_t * ch)276 int srf_read_cont_hdr(srf_t *srf, srf_cont_hdr_t *ch) {
277 char magic[3];
278 uint32_t sz;
279
280 if (!ch)
281 return -1;
282
283 /* Check block type */
284 if (EOF == (ch->block_type = fgetc(srf->fp)))
285 return -1;
286 if (ch->block_type != SRFB_CONTAINER)
287 return -1;
288
289 /* Check magic number && version */
290 if (3 != fread(magic, 1, 3, srf->fp))
291 return -1;
292 if (0 != srf_read_uint32(srf, &sz))
293 return -1;
294 if (srf_read_pstring(srf, ch->version) < 0)
295 return -1;
296 if (strncmp(magic, "SRF", 3) || strcmp(ch->version, SRF_VERSION))
297 return -1;
298
299 /* Containter type, base caller bits */
300 if (EOF == (ch->container_type = fgetc(srf->fp)) ||
301 srf_read_pstring(srf, ch->base_caller) < 0||
302 srf_read_pstring(srf, ch->base_caller_version) < 0)
303 return -1;
304
305 return 0;
306 }
307
308 /*
309 * Writes a container header to disk.
310 *
311 * 4 Block type + magic number ("SSRF")
312 * 1+n: pString for version number ("1.0")
313 * 1: "Z" => container type is ZTR
314 * 1+n: pString for base-caller ("eg Bustard")
315 * 1+n: pString for base-caller version (eg "1.8.28")
316 * 4: uint32 distance from start of header to first data block.
317 * 4: uint32 distance from start of block to index block (0). FIXME
318 *
319 * Returns 0 for success
320 * -1 for failure
321 */
srf_write_cont_hdr(srf_t * srf,srf_cont_hdr_t * ch)322 int srf_write_cont_hdr(srf_t *srf, srf_cont_hdr_t *ch) {
323 uint32_t sz = 0;
324
325 if (!ch)
326 return -1;
327
328 /* Magic number && version */
329 if (4 != fwrite(SRF_MAGIC, 1, 4, srf->fp))
330 return -1;
331
332 /* Header size */
333 sz = 9
334 + strlen(ch->version) + 1
335 + strlen(ch->base_caller) + 1
336 + strlen(ch->base_caller_version) + 1;
337 if (0 != srf_write_uint32(srf, sz))
338 return -1;
339
340 if (srf_write_pstring(srf, ch->version) < 0)
341 return -1;
342
343 /* Containter type, base caller bits */
344 if (EOF == fputc(ch->container_type, srf->fp))
345 return -1;
346 if (srf_write_pstring(srf, ch->base_caller) < 0)
347 return -1;
348 if (srf_write_pstring(srf, ch->base_caller_version) < 0)
349 return -1;
350
351 return ferror(srf->fp) ? -1 : 0;
352 }
353
354 /*
355 * Reads an XML TraceInfo block
356 * Returns 0 for success
357 * -1 for failure
358 */
srf_read_xml(srf_t * srf,srf_xml_t * xml)359 int srf_read_xml(srf_t *srf, srf_xml_t *xml) {
360 int block_type;
361
362 if (EOF == (block_type = fgetc(srf->fp)))
363 return -1;
364 if (block_type != SRFB_XML)
365 return -1;
366
367 if (0 != srf_read_uint32(srf, &xml->xml_len))
368 return -1;
369 xml->xml_len -= 5;
370
371 if (NULL == (xml->xml = (char *)realloc(xml->xml, xml->xml_len+1)))
372 return -1;
373 if (xml->xml_len != fread(xml->xml, 1, xml->xml_len, srf->fp))
374 return -1;
375 xml->xml[xml->xml_len] = 0;
376
377 return 0;
378 }
379
380 /*
381 * Writes an XML TraceInfo block
382 * Returns 0 for success
383 * -1 for failure
384 */
srf_write_xml(srf_t * srf,srf_xml_t * xml)385 int srf_write_xml(srf_t *srf, srf_xml_t *xml) {
386 if (!srf->fp)
387 return -1;
388
389 if (EOF == fputc(SRFB_XML, srf->fp))
390 return -1;
391
392 if (-1 == srf_write_uint32(srf, xml->xml_len+5))
393 return -1;
394
395 if (xml->xml_len != fwrite(xml->xml, 1, xml->xml_len, srf->fp))
396 return -1;
397
398 return ferror(srf->fp) ? -1 : 0;
399 }
400
401 /*
402 * Initialises a srf_trace_header_t and inserts some passed in values.
403 * If the supplied th is NULL then a new structure is allocated.
404 *
405 * Returns a pointer to the dh passed in (or allocated if NULL) on success
406 * NULL on failure
407 */
srf_construct_trace_hdr(srf_trace_hdr_t * th,char * prefix,unsigned char * header,uint32_t header_sz)408 srf_trace_hdr_t *srf_construct_trace_hdr(srf_trace_hdr_t *th,
409 char *prefix,
410 unsigned char *header,
411 uint32_t header_sz) {
412 if (!th) {
413 if (NULL == (th = (srf_trace_hdr_t *)calloc(1, sizeof(*th))))
414 return NULL;
415 }
416
417 th->block_type = SRFB_TRACE_HEADER;
418 strncpy(th->id_prefix, prefix, 255);
419 th->trace_hdr_size = header_sz;
420 th->trace_hdr = header;
421 th->read_prefix_type = 'E';
422
423 return th;
424 }
425
426 /*
427 * Deallocates a srf_trace_hdr_t if allocated by
428 * srf_construct_trace_hdr().
429 * Do not use this if you passed in a static srf_trace_hdr to the construct
430 * function.
431 */
srf_destroy_trace_hdr(srf_trace_hdr_t * th)432 void srf_destroy_trace_hdr(srf_trace_hdr_t *th) {
433 if (th) {
434 if (th->trace_hdr)
435 free(th->trace_hdr);
436 free(th);
437 }
438 }
439
440 /*
441 * Reads a data header and stores the result in 'th'.
442 * Returns 0 for success
443 * -1 for failure
444 */
srf_read_trace_hdr(srf_t * srf,srf_trace_hdr_t * th)445 int srf_read_trace_hdr(srf_t *srf, srf_trace_hdr_t *th) {
446 int z;
447
448 /* Check block type */
449 if (EOF == (th->block_type = fgetc(srf->fp)))
450 return -1;
451
452 if (th->block_type != SRFB_TRACE_HEADER)
453 return -1;
454 if (0 != srf_read_uint32(srf, &th->trace_hdr_size))
455 return -1;
456 th->trace_hdr_size -= 1 + 4 + 1;
457
458 /* Read-id prefix */
459 if (EOF == (th->read_prefix_type = fgetc(srf->fp)))
460 return -1;
461 if ((z = srf_read_pstring(srf, th->id_prefix)) < 0)
462 return -1;
463 th->trace_hdr_size -= z+1;
464
465 /* The data header itself */
466 if (th->trace_hdr)
467 free(th->trace_hdr);
468 if (th->trace_hdr_size) {
469 if (NULL == (th->trace_hdr = malloc(th->trace_hdr_size)))
470 return -1;
471 if (th->trace_hdr_size != fread(th->trace_hdr, 1,
472 th->trace_hdr_size, srf->fp)) {
473 free(th->trace_hdr);
474 th->trace_hdr = NULL;
475 return -1;
476 }
477 } else {
478 th->trace_hdr = NULL;
479 }
480
481 return 0;
482 }
483
484 /*
485 * Writes a srf_trace_hdr_t structure to disk.
486 * Returns 0 for sucess
487 * -1 for failure
488 */
srf_write_trace_hdr(srf_t * srf,srf_trace_hdr_t * th)489 int srf_write_trace_hdr(srf_t *srf, srf_trace_hdr_t *th) {
490 uint32_t sz;
491
492 if (!srf->fp)
493 return -1;
494
495 if (EOF == fputc(th->block_type, srf->fp))
496 return -1;
497
498 /* Size */
499 sz = 1 + 4 + 1
500 + strlen(th->id_prefix) + 1
501 + th->trace_hdr_size;
502 if (-1 == srf_write_uint32(srf, sz))
503 return -1;
504
505 /* Prefix */
506 if (EOF == fputc(th->read_prefix_type, srf->fp))
507 return -1;
508 if (-1 == srf_write_pstring(srf, th->id_prefix))
509 return -1;
510
511 /* The ztr header blob itself... */
512 if (th->trace_hdr_size !=
513 fwrite(th->trace_hdr, 1, th->trace_hdr_size, srf->fp))
514 return -1;
515
516 return ferror(srf->fp) ? -1 : 0;
517 }
518
519
srf_construct_trace_body(srf_trace_body_t * tb,char * suffix,int suffix_len,unsigned char * body,uint32_t body_size,unsigned char flags)520 srf_trace_body_t *srf_construct_trace_body(srf_trace_body_t *tb,
521 char *suffix,
522 int suffix_len,
523 unsigned char *body,
524 uint32_t body_size,
525 unsigned char flags) {
526 if (!tb) {
527 if (NULL == (tb = (srf_trace_body_t *)calloc(1, sizeof(*tb))))
528 return NULL;
529 }
530 tb->block_type = SRFB_TRACE_BODY;
531 if (suffix_len == -1) {
532 suffix_len = strlen(suffix);
533 if (suffix_len > 255)
534 suffix_len = 255;
535 }
536 memcpy(tb->read_id, suffix, suffix_len);
537 tb->read_id[suffix_len] = 0;
538 tb->read_id_length = suffix_len;
539
540 tb->trace = body;
541 tb->trace_size = body_size;
542 tb->flags = flags;
543
544 return tb;
545 }
546
srf_destroy_trace_body(srf_trace_body_t * tb)547 void srf_destroy_trace_body(srf_trace_body_t *tb) {
548 if (tb)
549 free(tb);
550 }
551
552 /*
553 * Writes a new trace body.
554 *
555 * Returns: 0 on success
556 * -1 on failure
557 */
srf_write_trace_body(srf_t * srf,srf_trace_body_t * tb)558 int srf_write_trace_body(srf_t *srf, srf_trace_body_t *tb) {
559 uint32_t sz;
560
561 if (!srf->fp)
562 return -1;
563
564 if (EOF == fputc(tb->block_type, srf->fp))
565 return -1;
566
567 /* Size */
568 sz = 6 + tb->read_id_length+1 + tb->trace_size;
569 if (0 != srf_write_uint32(srf, sz))
570 return -1;
571
572 /* Flags and name */
573 if (EOF == (fputc(tb->flags, srf->fp)))
574 return -1;
575 if (-1 == srf_write_pstringb(srf, tb->read_id, tb->read_id_length))
576 return -1;
577
578 /* Tbe ztr footer blob itself... */
579 if (tb->trace_size != fwrite(tb->trace, 1, tb->trace_size, srf->fp))
580 return -1;
581
582 return ferror(srf->fp) ? -1 : 0;
583 }
584
585 /*
586 * Reads a trace header + trace 'blob' and stores the result in 'th'
587 * If no_trace is true then it skips loading the trace data itself.
588 *
589 * Returns 0 for success
590 * -1 for failure
591 */
srf_read_trace_body(srf_t * srf,srf_trace_body_t * tb,int no_trace)592 int srf_read_trace_body(srf_t *srf, srf_trace_body_t *tb, int no_trace) {
593 int z;
594
595 /* Check block type */
596 if (EOF == (tb->block_type = fgetc(srf->fp)))
597 return -1;
598 if (tb->block_type != SRFB_TRACE_BODY)
599 return -1;
600
601 /* Size */
602 if (0 != srf_read_uint32(srf, &tb->trace_size))
603 return -1;
604 tb->trace_size -= 6;
605
606 /* Flags */
607 if (EOF == (z = fgetc(srf->fp)))
608 return -1;
609 tb->flags = z;
610
611 /* Read-id suffix */
612 if ((z = srf_read_pstring(srf, tb->read_id)) < 0)
613 return -1;
614 tb->read_id_length = z;
615 tb->trace_size -= z+1;
616
617 /* The trace data itself */
618 if (!no_trace) {
619 if (tb->trace_size) {
620 if (NULL == (tb->trace = malloc(tb->trace_size)))
621 return -1;
622 if (tb->trace_size != fread(tb->trace, 1, tb->trace_size,
623 srf->fp)) {
624 free(tb->trace);
625 tb->trace = NULL;
626 return -1;
627 }
628 } else {
629 tb->trace = NULL;
630 }
631 } else {
632 /* Skip */
633 fseeko(srf->fp, tb->trace_size, SEEK_CUR);
634 tb->trace = NULL;
635 }
636
637 return 0;
638 }
639
640
641 /*
642 * Reads a SRF index header. See srf_write_index_hdr for the format.
643 * If no_seek is true it reads the header starting at the current file
644 * offset, otherwise it seeks to the end of the file and reads that
645 * header instead.
646 *
647 * Returns 0 on success and fills out *hdr
648 * -1 on failure
649 */
srf_read_index_hdr(srf_t * srf,srf_index_hdr_t * hdr,int no_seek)650 int srf_read_index_hdr(srf_t *srf, srf_index_hdr_t *hdr, int no_seek) {
651 int sz, z;
652
653 /* Load footer */
654 if (!no_seek) {
655 if (0 != fseeko(srf->fp, -16, SEEK_END))
656 return -1;
657
658 if (4 != fread(hdr->magic, 1, 4, srf->fp))
659 return -1;
660 if (4 != fread(hdr->version, 1, 4, srf->fp))
661 return -1;
662 if (0 != srf_read_uint64(srf, &hdr->size))
663 return -1;
664
665 /* Check for validity */
666 if (memcmp(hdr->magic, SRF_INDEX_MAGIC, 4) ||
667 memcmp(hdr->version, SRF_INDEX_VERSION, 4))
668 return -1;
669
670 /* Seek to index header and re-read */
671 if (0 != fseeko(srf->fp, -(int64_t)hdr->size, SEEK_END))
672 return -1;
673 }
674
675 if (4 != fread(hdr->magic, 1, 4, srf->fp))
676 return -1;
677 if (4 != fread(hdr->version, 1, 4, srf->fp))
678 return -1;
679 if (0 != srf_read_uint64(srf, &hdr->size))
680 return -1;
681
682 /* Check once more */
683 if (memcmp(hdr->magic, SRF_INDEX_MAGIC, 4) ||
684 memcmp(hdr->version, SRF_INDEX_VERSION, 4))
685 return -1;
686
687 /* And finally the remainder of the index header details */
688 if (EOF == (hdr->index_type = fgetc(srf->fp)))
689 return -1;
690 if (EOF == (hdr->dbh_pos_stored_sep = fgetc(srf->fp)))
691 return -1;
692
693 if (0 != srf_read_uint32(srf, &hdr->n_container))
694 return -1;
695 if (0 != srf_read_uint32(srf, &hdr->n_data_block_hdr))
696 return -1;
697 if (0 != srf_read_uint64(srf, &hdr->n_buckets))
698 return -1;
699
700 sz = 34; /* fixed size of the above records */
701
702 if ((z = srf_read_pstring(srf, hdr->dbh_file)) < 0)
703 return -1;
704 sz += z+1;
705 if ((z = srf_read_pstring(srf, hdr->cont_file)) < 0)
706 return -1;
707 sz += z+1;
708
709 hdr->index_hdr_sz = sz;
710
711 return 0;
712 }
713
714 /*
715 * Writes a SRF index header.
716 *
717 * Header:
718 * x4 magic number, starting with 'I'.
719 * x4 version code (eg "1.00")
720 * x8 index size
721 * x1 index type ('E' normally)
722 * x1 dbh_pos_stored_sep (indicates if the item list contains the
723 * "data block header" index number).
724 * x4 number of containers
725 * x4 number of DBHs
726 * x8 number of hash buckets
727 *
728 * x* dbhFile p-string (NULL if held within the same file)
729 * x* contFile p-string (NULL if held within the same file)
730 *
731 * Returns 0 on success
732 * -1 on failure
733 */
srf_write_index_hdr(srf_t * srf,srf_index_hdr_t * hdr)734 int srf_write_index_hdr(srf_t *srf, srf_index_hdr_t *hdr) {
735 if (4 != fwrite(hdr->magic, 1, 4, srf->fp))
736 return -1;
737 if (4 != fwrite(hdr->version, 1, 4, srf->fp))
738 return -1;
739 if (0 != srf_write_uint64(srf, hdr->size))
740 return -1;
741
742 if (EOF == fputc(hdr->index_type, srf->fp))
743 return -1;
744 if (EOF == fputc(hdr->dbh_pos_stored_sep, srf->fp))
745 return -1;
746
747 if (0 != srf_write_uint32(srf, hdr->n_container))
748 return -1;
749 if (0 != srf_write_uint32(srf, hdr->n_data_block_hdr))
750 return -1;
751 if (0 != srf_write_uint64(srf, hdr->n_buckets))
752 return -1;
753
754 if (-1 == srf_write_pstring(srf, hdr->dbh_file))
755 return -1;
756 if (-1 == srf_write_pstring(srf, hdr->cont_file))
757 return -1;
758
759 return ferror(srf->fp) ? -1 : 0;
760 }
761
762 /* Position in index - internal struct used for code below only */
763 typedef struct {
764 uint64_t pos;
765 uint32_t dbh;
766 } pos_dbh;
767
768 /*
769 * This allocates and initialises an srf_index_t struct filling out the
770 * fields to default values or the supplied parameters. It does not
771 * actually write anything to disc itself.
772 *
773 * Note: non-NULL values for ch_file and th_file are not implemented yet.
774 *
775 * ch_file is the container header file. If specified as non-NULL it is the
776 * name of the file storing the container and DB records (trace bodies).
777 * NULL implies all the data is in the same file.
778 *
779 * th_file is the filename where we store the DBH records (trace headers).
780 * NULL implies all the data is in the same file.
781 *
782 * dbh_sep is a boolean value used to indicate whether we store the
783 * location of DBH+DB per trace or just the DB record. The latter uses less
784 * space and is most generally used, but the former is required if DBH and DB
785 * are split apart into two files (ch_file and th_file).
786 *
787 * Returns srf_index_t pointer on success.
788 * NULL on failure
789 */
srf_index_create(char * ch_file,char * th_file,int dbh_sep)790 srf_index_t *srf_index_create(char *ch_file, char *th_file, int dbh_sep) {
791 srf_index_t *idx = (srf_index_t *)malloc(sizeof(srf_index_t));
792 if (!idx)
793 return NULL;
794
795 if (ch_file) {
796 strncpy(idx->ch_file, ch_file, PATH_MAX);
797 idx->ch_file[PATH_MAX] = 0;
798 } else {
799 idx->ch_file[0] = 0;
800 }
801
802 if (th_file) {
803 strncpy(idx->th_file, th_file, PATH_MAX);
804 idx->th_file[PATH_MAX] = 0;
805 } else {
806 idx->th_file[0] = 0;
807 }
808
809 idx->dbh_pos_stored_sep = dbh_sep;
810
811 /* Create the arrays and hash table */
812 if (!(idx->ch_pos = ArrayCreate(sizeof(uint64_t), 0)))
813 return NULL;
814
815 if (!(idx->th_pos = ArrayCreate(sizeof(uint64_t), 0)))
816 return NULL;
817
818 if (!(idx->name_blocks = ArrayCreate(sizeof(srf_name_block_t), 0)))
819 return NULL;
820
821 if (!(idx->db_hash = HashTableCreate(0, HASH_DYNAMIC_SIZE |
822 HASH_FUNC_JENKINS3 |
823 HASH_NONVOLATILE_KEYS |
824 HASH_POOL_ITEMS)))
825 return NULL;
826
827 return idx;
828 }
829
830
831 /*
832 * Deallocates memory used by an srf_index_t structure.
833 */
srf_index_destroy(srf_index_t * idx)834 void srf_index_destroy(srf_index_t *idx) {
835 size_t i;
836
837 if (!idx)
838 return;
839
840 if (idx->db_hash)
841 HashTableDestroy(idx->db_hash, 0);
842 if (idx->ch_pos)
843 ArrayDestroy(idx->ch_pos);
844 if (idx->th_pos)
845 ArrayDestroy(idx->th_pos);
846 if (idx->name_blocks) {
847 for (i = 0; i < ArrayMax(idx->name_blocks); i++) {
848 if (NULL != arr(srf_name_block_t, idx->name_blocks, i).names)
849 free(arr(srf_name_block_t, idx->name_blocks, i).names);
850 }
851 ArrayDestroy(idx->name_blocks);
852 }
853
854 free(idx);
855 }
856
857
858 /*
859 * Dumps out some statistics on the index to fp, or stderr if fp
860 * is NULL.
861 */
srf_index_stats(srf_index_t * idx,FILE * fp)862 void srf_index_stats(srf_index_t *idx, FILE *fp) {
863 HashTableStats(idx->db_hash, fp ? fp : stderr);
864 }
865
866
867 /*
868 * Adds a container header (CH block) to the srf index.
869 *
870 * Returns 0 on success
871 * -1 on failure
872 */
srf_index_add_cont_hdr(srf_index_t * idx,uint64_t pos)873 int srf_index_add_cont_hdr(srf_index_t *idx, uint64_t pos) {
874 uint64_t *ip = ARRP(uint64_t, idx->ch_pos, ArrayMax(idx->ch_pos));
875 return ip ? (*ip = pos, 0) : -1;
876 }
877
878
879 /*
880 * Adds a trace header (DBH block) to the srf index.
881 *
882 * Returns 0 on success
883 * -1 on failure
884 */
srf_index_add_trace_hdr(srf_index_t * idx,uint64_t pos)885 int srf_index_add_trace_hdr(srf_index_t *idx, uint64_t pos) {
886 uint64_t *ip = ARRP(uint64_t, idx->th_pos, ArrayMax(idx->th_pos));
887 return ip ? (*ip = pos, 0) : -1;
888 }
889
890
891 /*
892 * Adds a trace body (DB block) to the srf index.
893 *
894 * Returns 0 on success
895 * -1 on failure
896 */
srf_index_add_trace_body(srf_index_t * idx,char * name,uint64_t pos)897 int srf_index_add_trace_body(srf_index_t *idx, char *name, uint64_t pos) {
898 HashData hd;
899 pos_dbh *pdbh;
900 srf_name_block_t *blockp;
901 char *name_copy;
902 size_t name_len;
903 int new;
904
905 if (idx->dbh_pos_stored_sep) {
906 if (NULL == (pdbh = (pos_dbh *)malloc(sizeof(*pdbh))))
907 return -1;
908 pdbh->pos = pos;
909 pdbh->dbh = ArrayMax(idx->th_pos);
910 hd.p = pdbh;
911 } else {
912 hd.i = pos;
913 }
914
915 name_len = strlen(name) + 1; /* Include NULL */
916
917 /* Allocate more space for names if needed */
918 if (ArrayMax(idx->name_blocks) == 0
919 || arr(srf_name_block_t, idx->name_blocks,
920 ArrayMax(idx->name_blocks) - 1).space <= name_len) {
921 blockp = ARRP(srf_name_block_t, idx->name_blocks,
922 ArrayMax(idx->name_blocks));
923 if (NULL == blockp) return -1;
924
925 blockp->used = 0;
926 blockp->space = (name_len < SRF_INDEX_NAME_BLOCK_SIZE
927 ? SRF_INDEX_NAME_BLOCK_SIZE
928 : name_len);
929 blockp->names = xmalloc(blockp->space);
930 if (NULL == blockp->names) {
931 ArrayMax(idx->name_blocks)--;
932 return -1;
933 }
934 }
935
936 blockp = ARRP(srf_name_block_t, idx->name_blocks,
937 ArrayMax(idx->name_blocks) - 1);
938 name_copy = blockp->names + blockp->used;
939 memcpy(name_copy, name, name_len);
940 blockp->used += name_len;
941 blockp->space -= name_len;
942
943 if (NULL == HashTableAdd(idx->db_hash, name_copy, name_len - 1, hd, &new)){
944 return -1;
945 }
946 if (0 == new) {
947 fprintf(stderr, "duplicate read name %s\n", name);
948 return -1;
949 }
950
951 return 0;
952 }
953
954
955 /*
956 * Writes the HashTable structures to 'fp'.
957 * This is a specialisation of the HashTable where the HashData is a
958 * position,size tuple.
959 *
960 * Header:
961 * x4 magic number, starting with 'I'.
962 * x4 version code (eg "1.00")
963 * x8 index size (should be x4 as we assume bucket locs are x4?)
964 *
965 * x1 index type ('E' normally)
966 * x1 dbh_pos_stored_sep (indicates if the item list contains the
967 * "data block header" index number).
968 *
969 * x4 number of containers
970 * x4 number of DBHs
971 * x8 number of hash buckets
972 *
973 * x* dbhFile p-string (NULL if held within the same file)
974 * x* contFile p-string (NULL if held within the same file)
975 *
976 * Containers: (1 entry per container)
977 * x8 file position of container header
978 *
979 * Data Block Headers: (1 entry per DBH)
980 * x8 file position of container header
981 *
982 * Buckets: (1 entry per bucket)
983 * x8 8-byte offset of linked list pos, rel. to the start of the hdr
984 *
985 * Items: (1 per trace)
986 * x1 name disambiguation hash, top-most bit set => last item in list
987 * x8 data position
988 * (x4) (dbh_index - optional; present if dbh_pos_stored_sep is 1)
989 *
990 * Footer:
991 * x4 magic number
992 * x4 version
993 * x8 index size
994 *
995 * Returns: the number of bytes written on success
996 * -1 for error
997 */
srf_index_write(srf_t * srf,srf_index_t * idx)998 int srf_index_write(srf_t *srf, srf_index_t *idx) {
999 unsigned int i, j;
1000 srf_index_hdr_t hdr;
1001 uint64_t *bucket_pos;
1002 int item_sz;
1003 HashTable *h = idx->db_hash;
1004
1005 /* Option: whether to store dbh positions directly in the index */
1006 hdr.dbh_pos_stored_sep = idx->dbh_pos_stored_sep;
1007
1008 /* Compute index size and bucket offsets */
1009 hdr.size = 34 +
1010 1 + strlen(idx->ch_file) +
1011 1 + strlen(idx->th_file);
1012 hdr.size += 8*(ArrayMax(idx->ch_pos) +
1013 ArrayMax(idx->th_pos) +
1014 h->nbuckets);
1015 if (NULL == (bucket_pos = (uint64_t *)calloc(h->nbuckets,
1016 sizeof(*bucket_pos))))
1017 return -1;
1018 for (i = 0; i < h->nbuckets; i++) {
1019 HashItem *hi;
1020 if (!(hi = h->bucket[i]))
1021 continue;
1022 bucket_pos[i] = hdr.size;
1023 item_sz = 1 + 8 + (hdr.dbh_pos_stored_sep ? 4 : 0);
1024 for (; hi; hi = hi->next)
1025 hdr.size += item_sz;
1026 }
1027 hdr.size += 16; /* footer */
1028
1029 /* Construct and write out the index header */
1030 memcpy(hdr.magic, SRF_INDEX_MAGIC, 4);
1031 memcpy(hdr.version, SRF_INDEX_VERSION, 4);
1032 hdr.index_type = 'E';
1033 hdr.n_container = ArrayMax(idx->ch_pos);
1034 hdr.n_data_block_hdr = ArrayMax(idx->th_pos);
1035 hdr.n_buckets = h->nbuckets;
1036 strncpy(hdr.dbh_file, idx->th_file, 255);
1037 strncpy(hdr.cont_file, idx->ch_file, 255);
1038 if (0 != srf_write_index_hdr(srf, &hdr))
1039 return -1;
1040
1041 /* Write the container and data block header arrays */
1042 j = ArrayMax(idx->ch_pos);
1043 for (i = 0; i < j; i++) {
1044 if (0 != srf_write_uint64(srf, arr(uint64_t, idx->ch_pos, i)))
1045 return -1;
1046 }
1047
1048 j = ArrayMax(idx->th_pos);
1049 for (i = 0; i < j; i++) {
1050 if (0 != srf_write_uint64(srf, arr(uint64_t, idx->th_pos, i)))
1051 return -1;
1052 }
1053
1054 /* Write out buckets */
1055 for (i = 0; i < h->nbuckets; i++) {
1056 if (0 != srf_write_uint64(srf, bucket_pos[i]))
1057 return -1;
1058 }
1059
1060 /* Write out the trace locations themselves */
1061 for (i = 0; i < h->nbuckets; i++) {
1062 HashItem *hi;
1063 /*
1064 fprintf(stderr, "Bucket %d offset %lld vs %lld\n",
1065 i, ftell(srf->fp), bucket_pos[i]);
1066 */
1067 if (!(hi = h->bucket[i]))
1068 continue;
1069 for (; hi; hi = hi->next) {
1070 uint64_t pos;
1071 uint32_t dbh = 0;
1072 uint32_t h7;
1073
1074 if (hdr.dbh_pos_stored_sep) {
1075 pos_dbh *pdbh = (pos_dbh *)hi->data.p;
1076 pos = pdbh->pos;
1077 dbh = pdbh->dbh;
1078 } else {
1079 pos = hi->data.i;
1080 }
1081
1082 /* Rehash key in 7 bits; */
1083 h7 = hash64(h->options & HASH_FUNC_MASK,
1084 (uint8_t *)hi->key, hi->key_len) >> 57;
1085 if (!hi->next)
1086 h7 |= 0x80;
1087 /*
1088 fprintf(stderr, "\t%.*s => %x @ %lld\n",
1089 hi->key_len, hi->key, h7, pos);
1090 */
1091 if (fputc(h7, srf->fp) < 0)
1092 return -1;
1093 if (0 != srf_write_uint64(srf, pos))
1094 return -1;
1095
1096 if (hdr.dbh_pos_stored_sep)
1097 if (0 != srf_write_uint32(srf, dbh))
1098 return -1;
1099 }
1100 }
1101
1102 /* Footer */
1103 if (4 != fwrite(hdr.magic, 1, 4, srf->fp))
1104 return -1;
1105 if (4 != fwrite(hdr.version, 1, 4, srf->fp))
1106 return -1;
1107 if (0 != srf_write_uint64(srf, hdr.size))
1108 return -1;
1109
1110 free(bucket_pos);
1111
1112 return 0;
1113 }
1114
1115 /*
1116 * ---------------------------------------------------------------------------
1117 * Trace name codec details.
1118 */
1119
1120 /*
1121 * Reads up to 32-bits worth of data and returns. Updates the block
1122 * byte and bit values to indicate the current 'read' position.
1123 *
1124 * Returns unsigned value on success (>=0)
1125 * -1 on failure
1126 */
get_hi_bits(block_t * block,int nbits)1127 static uint32_t get_hi_bits(block_t *block, int nbits) {
1128 unsigned int val, bnum = 0;
1129
1130 if (block->byte*8 + block->bit + nbits > block->alloc * 8)
1131 return -1;
1132
1133 /* Fetch the partial byte of data */
1134 val = (block->data[block->byte]) & ((1<<(8-block->bit))-1);
1135 bnum = 8 - block->bit;
1136
1137 if (bnum >= nbits) {
1138 val >>= bnum-nbits;
1139 val &= (1<<nbits)-1;
1140 block->bit += nbits;
1141 return val;
1142 }
1143
1144 /* And additional entire bytes worth as required */
1145 while (bnum+8 <= nbits && bnum+8 < 32) {
1146 val <<= 8;
1147 val |= block->data[++block->byte];
1148 bnum += 8;
1149 }
1150
1151 /* The remaining partial byte */
1152 val <<= nbits-bnum;
1153 val |= (block->data[++block->byte] >> (8-(nbits-bnum)))
1154 & ((1<<(nbits-bnum))-1);
1155 block->bit = nbits-bnum;
1156
1157 return val;
1158 }
1159
1160 /*
1161 * Stores up to 32-bits of data in a block
1162 */
set_hi_bits(block_t * block,uint32_t val,int nbits)1163 static void set_hi_bits(block_t *block, uint32_t val, int nbits) {
1164 unsigned int curr = block->data[block->byte];
1165
1166 /* Pack first partial byte */
1167 if (nbits > 8-block->bit) {
1168 curr |= val >> (nbits -= 8-block->bit);
1169 block->data[block->byte] = curr;
1170 block->data[++block->byte] = 0;
1171 block->bit = 0;
1172 } else {
1173 curr |= val << (8-block->bit - nbits);
1174 block->data[block->byte] = curr;
1175 if ((block->bit += nbits) == 8) {
1176 block->bit = 0;
1177 block->data[++block->byte] = 0;
1178 }
1179 return;
1180 }
1181
1182 /* Handle whole bytes worth */
1183 while (nbits > 8) {
1184 block->data[block->byte++] = (val >> (nbits-=8)) & 0xff;
1185 }
1186
1187 /* And finally any remaining bits left */
1188 block->data[block->byte] = (val & ((1<<nbits) - 1)) << (8-nbits);
1189 block->bit = nbits;
1190 }
1191
1192
1193 /*
1194 * Formats are specified embedded in 'fmt' using a percent-rule, much
1195 * like printf().
1196 *
1197 * Both fmt and suffix are C-style nul terminated strings.
1198 *
1199 * The format consists of:
1200 *
1201 * '%' <field-width> <bits-used> <format-code>
1202 *
1203 * Field-width is a numerical value indicating the number of characters we
1204 * wish to print. It is optional as without specifying this we emit as
1205 * many characters as are needed to describe the data. If specified the
1206 * output is padded to be at least field-width in size. The padding character
1207 * may vary on the otuput format, but will typically be '0'.
1208 *
1209 * Bits-used consists of '.' (a full stop) followed by a numerical value
1210 * indicating the number of bits to read from the suffix, starting from
1211 * bit 0 or the next free bit following a previous format. If not specified
1212 * generally all bits are used (8 * suffix_len) unless otherwise indicated
1213 * below.
1214 *
1215 * Format-code may be one of:
1216 * 'd' decimal values (0-9)
1217 * 'o' octal values (0-7)
1218 * 'x' hexidecimal values, lowercase
1219 * 'X' hexidecimal values, uppercase
1220 * 'j' base-36 encoding, lowercase
1221 * 'J' base-36 encoding, uppercase (454)
1222 * 'c' a single character (default bits used = 8)
1223 * 's' string (all bits used, treated as ascii)
1224 * '%' a literal percent character (no bits used).
1225 *
1226 * Returns the number of bytes written to 'name' on success
1227 * -1 on failure
1228 */
1229 #define emit(c) \
1230 if (out_pos < name_len-1) { \
1231 name[out_pos++] = (c); \
1232 } else { \
1233 block_destroy(blk, 1);\
1234 return name_len; \
1235 }
1236
construct_trace_name(char * fmt,unsigned char * suffix,int suffix_len,char * name,int name_len)1237 int construct_trace_name(char *fmt,
1238 unsigned char *suffix, int suffix_len,
1239 char *name, int name_len) {
1240 block_t *blk = block_create(suffix, suffix_len);
1241 int out_pos = 0;
1242 int percent = 0;
1243
1244 /* Default nul-terminate for abort cases */
1245 name[name_len-1] = '\0';
1246
1247 for(; *fmt; fmt++) {
1248 switch(*fmt) {
1249
1250 /* A format specifier */
1251 case '%': {
1252 int width = 0;
1253 int bits = 0;
1254 uint32_t val;
1255
1256 fmt++;
1257 percent++;
1258
1259 /* Width specifier */
1260 if (0 == (width = strtol(fmt, &fmt, 10)))
1261 width = 1; /* minimum width */
1262
1263 /* Bit size specifier */
1264 if ('.' == *fmt) {
1265 fmt++;
1266 bits = strtol(fmt, &fmt, 10);
1267 }
1268
1269 /* The format code */
1270 switch (*fmt) {
1271 int i;
1272
1273 case '%':
1274 for (i = 0; i < width; i++) {
1275 emit('%');
1276 }
1277 break;
1278
1279 case 'o':
1280 case 'd':
1281 case 'x':
1282 case 'X':
1283 case 'j':
1284 case 'J': {
1285 /* One of the integer encoding formats */
1286 char *digits = "0123456789abcdef";
1287 int d = 0, tmp_ind = 0;
1288 char tmp[1024];
1289
1290 switch(*fmt) {
1291 case 'o': d=8; break;
1292 case 'd': d=10; break;
1293 case 'x': d=16; break;
1294
1295 case 'X':
1296 d=16;
1297 digits = "0123456789ABCDEF";
1298 break;
1299
1300 case 'j':
1301 d=36;
1302 digits = "abcdefghijklmnopqrstuvwxyz0123456789";
1303 break;
1304
1305 case 'J': /* Used by 454 */
1306 d=36;
1307 digits = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1308 break;
1309 }
1310
1311 while (bits > 0) {
1312 int32_t sv;
1313 int nb = bits > 32 ? 32 : bits;
1314 if (-1 == (sv = get_hi_bits(blk, nb)))
1315 return -1;
1316 val = sv;
1317
1318 do {
1319 tmp[tmp_ind++] = digits[val % d];
1320 val /= d;
1321 } while (val != 0);
1322
1323 bits -= nb;
1324 }
1325
1326 /* Pad to requested size */
1327 for (i = width; i > tmp_ind; i--)
1328 emit(*digits);
1329
1330 /* Output the formatted value itself */
1331 do {
1332 emit(tmp[--tmp_ind]);
1333 } while (tmp_ind != 0);
1334
1335 break;
1336 }
1337
1338 case 'c':
1339 /* A single n-bit character */
1340 if (!bits)
1341 bits = 8;
1342 if (-1 == (val = get_hi_bits(blk, bits)))
1343 return -1;
1344
1345 emit(val);
1346 break;
1347
1348 case 's': {
1349 /* A string, to the end of suffix, of n-bit characters */
1350 if (!bits)
1351 bits = 8;
1352 /* Reading n-bits at a time to produce a string */
1353 while (-1 != (val = get_hi_bits(blk, bits)))
1354 emit(val);
1355 break;
1356 }
1357
1358 default:
1359 fprintf(stderr, "Unknown arg: %c\n", *fmt);
1360 }
1361
1362 }
1363
1364 case '\0':
1365 break;
1366
1367 default:
1368 emit(*fmt);
1369 }
1370 }
1371
1372 /*
1373 * No percent rule found implies the name is a simple string
1374 * concatenation of prefix and suffix
1375 */
1376 if (!percent) {
1377 int i;
1378
1379 /* A strncpy would be more efficient here */
1380 for (i = 0; suffix[i]; i++) {
1381 emit(suffix[i]);
1382 }
1383 }
1384
1385 emit('\0');
1386
1387 block_destroy(blk, 1);
1388 return out_pos;
1389 }
1390
1391 /*
1392 * The opposite of above.
1393 * Given a format and a set of arguments packs the values into the
1394 * supplied 'suffix' string. This should be 256 characters long and the
1395 * first byte will consist of the real length.
1396 *
1397 * Returns 0 on success
1398 * -1 on failuare
1399 */
pack_trace_suffix(unsigned char * suffix,char * fmt,...)1400 int pack_trace_suffix(unsigned char *suffix, char *fmt, ...) {
1401 block_t *blk = block_create(NULL, 256);
1402 va_list args;
1403
1404 va_start(args, fmt);
1405
1406 for(; *fmt; fmt++) {
1407 switch(*fmt) {
1408
1409 /* A format specifier */
1410 case '%': {
1411 int width __UNUSED__;
1412 int bits = 0;
1413 signed int val;
1414
1415 fmt++;
1416
1417 /* Width specifier - ignore this */
1418 width = strtol(fmt, &fmt, 10);
1419
1420 /* Bit size specifier */
1421 if ('.' == *fmt) {
1422 fmt++;
1423 bits = strtol(fmt, &fmt, 10);
1424 }
1425
1426 /* The format code */
1427 switch (*fmt) {
1428 case '%':
1429 /* A literal percent */
1430 break;
1431
1432 case 'o':
1433 case 'd':
1434 case 'x':
1435 case 'X':
1436 case 'j':
1437 case 'J':
1438 /* A numeric value - it doesn't matter what format it
1439 * is specified as, just how many bits.
1440 */
1441 val = (uint32_t)va_arg(args, int);
1442 set_hi_bits(blk, val, bits);
1443 break;
1444
1445 case 'c':
1446 if (!bits)
1447 bits = 8;
1448 val = (unsigned char)va_arg(args, int);
1449 set_hi_bits(blk, val, bits);
1450 break;
1451
1452 case 's': {
1453 char *s = (char *)va_arg(args, char *);
1454 if (!bits)
1455 bits = 8;
1456
1457 for (; *s; s++) {
1458 set_hi_bits(blk, *s, bits);
1459 }
1460 break;
1461 }
1462
1463 default:
1464 fprintf(stderr, "Unknown arg: %c\n", *fmt);
1465 }
1466 }
1467 }
1468 }
1469
1470 if (blk->byte >= 256)
1471 return -1;
1472
1473 *suffix = blk->byte + (blk->bit > 0);
1474 memcpy(suffix+1, blk->data, *suffix);
1475
1476 return 0;
1477 }
1478
1479
1480 /*
1481 * ---------------------------------------------------------------------------
1482 * Higher level I/O functions
1483 */
1484
1485 /*
1486 * Fetches the next trace from an SRF container as a "memory-FILE".
1487 * Name, if defined (which should be a buffer of at least 512 bytes long)
1488 * will be filled out to contain the read name.
1489 *
1490 * Returns mFILE containing trace on success
1491 * NULL on failure.
1492 */
srf_next_trace(srf_t * srf,char * name)1493 mFILE *srf_next_trace(srf_t *srf, char *name) {
1494 do {
1495 int type;
1496
1497 switch(type = srf_next_block_type(srf)) {
1498 case -1:
1499 /* EOF */
1500 return NULL;
1501
1502 case SRFB_NULL_INDEX: {
1503 /*
1504 * Maybe the last 8 bytes of a the file (or previously was
1505 * last 8 bytes prior to concatenating SRF files together).
1506 * If so it's the index length and should always be 8 zeros.
1507 */
1508 uint64_t ilen;
1509 if (1 != fread(&ilen, 8, 1, srf->fp))
1510 return NULL;
1511 if (ilen != 0)
1512 return NULL;
1513 break;
1514 }
1515
1516 case SRFB_CONTAINER:
1517 if (0 != srf_read_cont_hdr(srf, &srf->ch))
1518 return NULL;
1519 break;
1520
1521 case SRFB_XML:
1522 if (0 != srf_read_xml(srf, &srf->xml))
1523 return NULL;
1524 break;
1525
1526 case SRFB_TRACE_HEADER:
1527 if (0 != srf_read_trace_hdr(srf, &srf->th))
1528 return NULL;
1529
1530 break;
1531
1532 case SRFB_TRACE_BODY: {
1533 mFILE *mf = mfcreate(NULL, 0);
1534 srf_trace_body_t tb;
1535 tb.trace = NULL;
1536
1537 if (!mf || 0 != srf_read_trace_body(srf, &tb, 0))
1538 return NULL;
1539
1540 if (name) {
1541 if (-1 == construct_trace_name(srf->th.id_prefix,
1542 (unsigned char *)tb.read_id,
1543 tb.read_id_length,
1544 name, 512)) {
1545 return NULL;
1546 }
1547 }
1548
1549 if (srf->th.trace_hdr_size)
1550 mfwrite(srf->th.trace_hdr, 1, srf->th.trace_hdr_size, mf);
1551 if (tb.trace_size)
1552 mfwrite(tb.trace, 1, tb.trace_size, mf);
1553
1554 if (tb.trace)
1555 free(tb.trace);
1556
1557 mrewind(mf);
1558 return mf;
1559 }
1560
1561 case SRFB_INDEX: {
1562 off_t pos = ftell(srf->fp);
1563 srf_read_index_hdr(srf, &srf->hdr, 1);
1564
1565 /* Skip the index body */
1566 fseeko(srf->fp, pos + srf->hdr.size, SEEK_SET);
1567 break;
1568 }
1569
1570 default:
1571 fprintf(stderr, "Block of unknown type '%c'. Aborting\n", type);
1572 return NULL;
1573 }
1574 } while (1);
1575
1576 return NULL;
1577 }
1578
1579 /*
1580 * Decodes a partial ZTR file consisting of data in 'mf'.
1581 * Note that mf may contain a partial chunk, so we need to be careful on
1582 * error checking.
1583 *
1584 * If a ztr object is passed in (in 'z') then we assume we've already
1585 * loaded the ZTR header and get straight down to decoding the remaining
1586 * chunks. Otherwise we also decode the header.
1587 *
1588 * If no chunk is visible at all then we'll return NULL and rewind mf.
1589 * Otherwise we'll leave the file pointer at the start of the next
1590 * partial chunk (or EOF if none) and return the ztr_t pointer.
1591 */
partial_decode_ztr(srf_t * srf,mFILE * mf,ztr_t * z)1592 ztr_t *partial_decode_ztr(srf_t *srf, mFILE *mf, ztr_t *z) {
1593 ztr_t *ztr;
1594 ztr_chunk_t *chunk;
1595 long pos = 0;
1596
1597 if (z) {
1598 /* Use existing ZTR object => already loaded header */
1599 ztr = z;
1600
1601 } else {
1602 /* Allocate or use existing ztr */
1603 if (NULL == (ztr = new_ztr()))
1604 return NULL;
1605
1606 /* Read the header */
1607 if (-1 == ztr_read_header(mf, &ztr->header)) {
1608 if (!z)
1609 delete_ztr(ztr);
1610 mrewind(mf);
1611 return NULL;
1612 }
1613
1614 /* Check magic number and version */
1615 if (memcmp(ztr->header.magic, ZTR_MAGIC, 8) != 0) {
1616 if (!z)
1617 delete_ztr(ztr);
1618 mrewind(mf);
1619 return NULL;
1620 }
1621
1622 if (ztr->header.version_major != ZTR_VERSION_MAJOR) {
1623 if (!z)
1624 delete_ztr(ztr);
1625 mrewind(mf);
1626 return NULL;
1627 }
1628 }
1629
1630 /* Load chunks */
1631 pos = mftell(mf);
1632 while ((chunk = ztr_read_chunk_hdr(mf))) {
1633 chunk->data = (char *)xmalloc(chunk->dlength);
1634 if (chunk->dlength != mfread(chunk->data, 1, chunk->dlength, mf))
1635 break;
1636
1637 ztr->nchunks++;
1638 ztr->chunk = (ztr_chunk_t *)xrealloc(ztr->chunk, ztr->nchunks *
1639 sizeof(ztr_chunk_t));
1640 memcpy(&ztr->chunk[ztr->nchunks-1], chunk, sizeof(*chunk));
1641 xfree(chunk);
1642 pos = mftell(mf);
1643 }
1644
1645 /*
1646 * At this stage we're 'pos' into the mFILE mf with any remainder being
1647 * a partial block.
1648 */
1649 if (0 == ztr->nchunks) {
1650 if (!z)
1651 delete_ztr(ztr);
1652 mrewind(mf);
1653 return NULL;
1654 }
1655
1656 /* Ensure we exit at the start of a ztr CHUNK */
1657 mfseek(mf, pos, SEEK_SET);
1658
1659 /* If this is the header part, ensure we uncompress and init. data */
1660 if (!z) {
1661 /* Force caching of huffman code_sets */
1662 ztr_find_hcode(ztr, CODE_USER);
1663
1664 /* And uncompress the rest */
1665 uncompress_ztr(ztr);
1666 }
1667
1668 return ztr;
1669 }
1670
1671 /*
1672 * Creates a copy of ztr_t 'src' and returns it. The newly returned ztr_t
1673 * will consist of shared components where src and dest overlap, but freeing
1674 * dest will know what's appropriate to free and what is not.
1675 */
ztr_dup(ztr_t * src)1676 ztr_t *ztr_dup(ztr_t *src) {
1677 ztr_t *dest = new_ztr();
1678 int i;
1679
1680 if (!dest)
1681 return NULL;
1682
1683 /* Basics */
1684 *dest = *src;
1685
1686 /* Mirror chunks */
1687 dest->chunk = (ztr_chunk_t *)malloc(src->nchunks * sizeof(ztr_chunk_t));
1688 for (i = 0; i < src->nchunks; i++) {
1689 dest->chunk[i] = src->chunk[i];
1690 dest->chunk[i].ztr_owns = 0; /* src owns the data/meta_data */
1691 }
1692
1693 /* Mirror text_segments; no overlap here */
1694 dest->text_segments = (ztr_text_t *)malloc(src->ntext_segments *
1695 sizeof(ztr_text_t));
1696 for (i = 0; i < src->ntext_segments; i++) {
1697 dest->text_segments[i] = src->text_segments[i];
1698 }
1699
1700 /* huffman hcodes */
1701 dest->hcodes = (ztr_hcode_t *)malloc(src->nhcodes * sizeof(ztr_hcode_t));
1702 for (i = 0; i < src->nhcodes; i++) {
1703 dest->hcodes[i] = src->hcodes[i];
1704 dest->hcodes[i].ztr_owns = 0;
1705 }
1706
1707 return dest;
1708 }
1709
1710 /*
1711 * Fetches the next trace from an SRF container as a ZTR object.
1712 * This is more efficient than srf_next_trace() if we are serially
1713 * reading through many traces as we decode ZTR data less often and can
1714 * cache data from one trace to the next.
1715 *
1716 * Name, if defined (which should be a buffer of at least 512 bytes long)
1717 * will be filled out to contain the read name.
1718 *
1719 * filter_mask should consist of zero or more SRF_READ_FLAG_* bits.
1720 * Reads with one or more flags matching these bits will be skipped over.
1721 *
1722 * flags, if non-NULL, is filled out on exit to contain the SRF flags
1723 * from the Data Block structure.
1724 *
1725 * Returns ztr_t * on success
1726 * NULL on failure.
1727 */
srf_next_ztr_flags(srf_t * srf,char * name,int filter_mask,int * flags)1728 ztr_t *srf_next_ztr_flags(srf_t *srf, char *name, int filter_mask,
1729 int *flags) {
1730 do {
1731 int type;
1732
1733 switch(type = srf_next_block_type(srf)) {
1734 case -1:
1735 /* EOF */
1736 return NULL;
1737
1738 case SRFB_NULL_INDEX: {
1739 /*
1740 * Maybe the last 8 bytes of a the file (or previously was
1741 * last 8 bytes prior to concatenating SRF files together).
1742 * If so it's the index length and should always be 8 zeros.
1743 */
1744 uint64_t ilen;
1745 if (1 != fread(&ilen, 8, 1, srf->fp))
1746 return NULL;
1747 if (ilen != 0)
1748 return NULL;
1749 break;
1750 }
1751
1752 case SRFB_CONTAINER:
1753 if (0 != srf_read_cont_hdr(srf, &srf->ch))
1754 return NULL;
1755 break;
1756
1757 case SRFB_XML:
1758 if (0 != srf_read_xml(srf, &srf->xml))
1759 return NULL;
1760 break;
1761
1762 case SRFB_TRACE_HEADER:
1763 if (0 != srf_read_trace_hdr(srf, &srf->th))
1764 return NULL;
1765
1766 /* Decode ZTR chunks in the header */
1767 if (srf->mf)
1768 mfdestroy(srf->mf);
1769
1770 srf->mf = mfcreate(NULL, 0);
1771 if (srf->th.trace_hdr_size)
1772 mfwrite(srf->th.trace_hdr, 1, srf->th.trace_hdr_size, srf->mf);
1773 if (srf->ztr)
1774 delete_ztr(srf->ztr);
1775 mrewind(srf->mf);
1776
1777 if (NULL != (srf->ztr = partial_decode_ztr(srf, srf->mf, NULL))) {
1778 srf->mf_pos = mftell(srf->mf);
1779 } else {
1780 /* Maybe not enough to decode or no headerBlob. */
1781 /* So delay until decoding the body. */
1782 srf->mf_pos = 0;
1783 }
1784 mfseek(srf->mf, 0, SEEK_END);
1785 srf->mf_end = mftell(srf->mf);
1786
1787 break;
1788
1789 case SRFB_TRACE_BODY: {
1790 srf_trace_body_t tb;
1791 ztr_t *ztr_tmp;
1792
1793 if (!srf->mf || 0 != srf_read_trace_body(srf, &tb, 0))
1794 return NULL;
1795
1796 if (name) {
1797 if (-1 == construct_trace_name(srf->th.id_prefix,
1798 (unsigned char *)tb.read_id,
1799 tb.read_id_length,
1800 name, 512)) {
1801 return NULL;
1802 }
1803 }
1804
1805 mfseek(srf->mf, srf->mf_end, SEEK_SET);
1806 if (tb.trace_size) {
1807 mfwrite(tb.trace, 1, tb.trace_size, srf->mf);
1808 free(tb.trace);
1809 tb.trace = NULL;
1810 }
1811 mftruncate(srf->mf, mftell(srf->mf));
1812 mfseek(srf->mf, srf->mf_pos, SEEK_SET);
1813
1814 if (tb.flags & filter_mask) {
1815 break; /* Filtered, so skip it */
1816 } else {
1817 if (flags)
1818 *flags = tb.flags;
1819 if (srf->ztr)
1820 ztr_tmp = ztr_dup(srf->ztr); /* inefficient, but simple */
1821 else
1822 ztr_tmp = NULL;
1823
1824 return partial_decode_ztr(srf, srf->mf, ztr_tmp);
1825 }
1826 }
1827
1828 case SRFB_INDEX: {
1829 off_t pos = ftell(srf->fp);
1830 srf_read_index_hdr(srf, &srf->hdr, 1);
1831
1832 /* Skip the index body */
1833 fseeko(srf->fp, pos + srf->hdr.size, SEEK_SET);
1834 break;
1835 }
1836
1837 default:
1838 fprintf(stderr, "Block of unknown type '%c'. Aborting\n", type);
1839 return NULL;
1840 }
1841 } while (1);
1842
1843 return NULL;
1844 }
1845
srf_next_ztr(srf_t * srf,char * name,int filter_mask)1846 ztr_t *srf_next_ztr(srf_t *srf, char *name, int filter_mask) {
1847 return srf_next_ztr_flags(srf, name, filter_mask, NULL);
1848 }
1849
1850 /*
1851 * Returns the type of the next block.
1852 * -1 for none (EOF)
1853 */
srf_next_block_type(srf_t * srf)1854 int srf_next_block_type(srf_t *srf) {
1855 int c = fgetc(srf->fp);
1856 if (c == EOF)
1857 return -1;
1858
1859 ungetc(c, srf->fp);
1860
1861 return c;
1862 }
1863
1864 /*
1865 * Reads the next SRF block from an archive and returns the block type.
1866 * If the block is a trace it'll return the full trace name too (maximum
1867 * 512 bytes).
1868 *
1869 * Returns block type on success, writing to pos and name as appropriate
1870 * -1 on EOF
1871 * -2 on failure
1872 */
srf_next_block_details(srf_t * srf,uint64_t * pos,char * name)1873 int srf_next_block_details(srf_t *srf, uint64_t *pos, char *name) {
1874 int type;
1875 *pos = ftell(srf->fp);
1876
1877 switch(type = srf_next_block_type(srf)) {
1878 case -1:
1879 /* EOF */
1880 return -1;
1881
1882 case SRFB_NULL_INDEX: {
1883 /*
1884 * Maybe the last 8 bytes of a the file (or previously was
1885 * last 8 bytes prior to concatenating SRF files together).
1886 * If so it's the index length and should always be 8 zeros.
1887 */
1888 uint64_t ilen;
1889 if (1 != fread(&ilen, 8, 1, srf->fp))
1890 return -2;
1891 if (ilen != 0)
1892 return -2;
1893 break;
1894 }
1895
1896 case SRFB_CONTAINER:
1897 if (0 != srf_read_cont_hdr(srf, &srf->ch))
1898 return -2;
1899 break;
1900
1901 case SRFB_TRACE_HEADER:
1902 if (0 != srf_read_trace_hdr(srf, &srf->th))
1903 return -2;
1904
1905 break;
1906
1907 case SRFB_TRACE_BODY:
1908 /* Inefficient, but it'll do for testing purposes */
1909 if (0 != srf_read_trace_body(srf, &srf->tb, 1))
1910 return -2;
1911
1912 if (name) {
1913 if (-1 == construct_trace_name(srf->th.id_prefix,
1914 (unsigned char *)srf->tb.read_id,
1915 srf->tb.read_id_length,
1916 name, 512)) {
1917 return -2;
1918 }
1919 }
1920
1921 break;
1922
1923 case SRFB_INDEX:
1924 srf_read_index_hdr(srf, &srf->hdr, 1);
1925
1926 /* Skip the index body */
1927 fseeko(srf->fp, *pos + srf->hdr.size, SEEK_SET);
1928 break;
1929
1930
1931 default:
1932 fprintf(stderr, "Block of unknown type '%c'. Aborting\n", type);
1933 return -2;
1934 }
1935
1936 return type;
1937 }
1938
1939 /*
1940 * Searches through 'nitems' 8-byte values stored in 'srf' at file offset
1941 * 'start' onwards for the closest value <= 'query'.
1942 *
1943 * Returns 0 on success, setting *res
1944 * -1 on failure
1945 */
binary_scan(srf_t * srf,int nitems,uint64_t start,uint64_t query,uint64_t * res)1946 static int binary_scan(srf_t *srf, int nitems, uint64_t start, uint64_t query,
1947 uint64_t *res) {
1948 int min = 0;
1949 int max = nitems;
1950 int guess, i;
1951 uint64_t pos = 0, best = 0;
1952
1953 if (nitems <= 0)
1954 return -1;
1955
1956 /* Binary search on disk for approx location */
1957 while (max - min > 100) {
1958 guess = (max - min) / 2 + min;
1959
1960 if (guess == max)
1961 guess = max-1;
1962
1963 if (-1 == fseeko(srf->fp, guess * 8 + start, SEEK_SET))
1964 return -1;
1965 if (0 != srf_read_uint64(srf, &pos))
1966 return -1;
1967 if (pos > query) {
1968 max = guess;
1969 } else {
1970 min = guess;
1971 }
1972 }
1973
1974 /* Within a small distance => linear scan now to avoid needless disk IO */
1975 if (-1 == fseeko(srf->fp, min * 8 + start, SEEK_SET))
1976 return -1;
1977 for (i = min; i < max; i++) {
1978 if (0 != srf_read_uint64(srf, &pos))
1979 return -1;
1980 if (pos > query) {
1981 break;
1982 } else {
1983 best = pos;
1984 }
1985 }
1986
1987 assert(best <= query);
1988 *res = best;
1989
1990 return 0;
1991 }
1992
1993 /*
1994 * Searches in an SRF index for a trace of a given name.
1995 * If found it sets the file offsets for the container (cpos), data block
1996 * header (hpos) and data block (dpos).
1997 *
1998 * On a test with 2 containers and 12 headers this averaged at 6.1 reads per
1999 * trace fetch and 8.0 seeks.
2000 *
2001 * Returns 0 on success
2002 * -1 on failure (eg no index)
2003 * -2 on trace not found in index.
2004 */
srf_find_trace(srf_t * srf,char * tname,uint64_t * cpos,uint64_t * hpos,uint64_t * dpos)2005 int srf_find_trace(srf_t *srf, char *tname,
2006 uint64_t *cpos, uint64_t *hpos, uint64_t *dpos) {
2007 srf_index_hdr_t hdr;
2008 uint64_t hval, bnum;
2009 uint64_t bucket_pos;
2010 off_t ipos, skip;
2011 int item_sz = 8;
2012
2013 /* Check for valid index */
2014 if (0 != srf_read_index_hdr(srf, &hdr, 0)) {
2015 return -1;
2016 }
2017 ipos = ftello(srf->fp);
2018 skip = hdr.n_container * 8 + hdr.n_data_block_hdr * 8;
2019 if (hdr.dbh_pos_stored_sep)
2020 item_sz += 4;
2021
2022 /* Hash and load the bucket */
2023 hval = hash64(HASH_FUNC_JENKINS3, (unsigned char *)tname, strlen(tname));
2024 bnum = hval & (hdr.n_buckets - 1);
2025 if (-1 == fseeko(srf->fp, ipos + skip + bnum * 8, SEEK_SET))
2026 return -1;
2027
2028 if (0 != srf_read_uint64(srf, &bucket_pos))
2029 return -1;
2030 if (!bucket_pos)
2031 return -2;
2032
2033 /* Secondary hash is the top 7-bits */
2034 hval >>= 57;
2035
2036 /* Jump to the item list */
2037 if (-1 == fseeko(srf->fp, ipos-hdr.index_hdr_sz + bucket_pos, SEEK_SET))
2038 return -1;
2039 for (;;) {
2040 char name[1024];
2041 int h = fgetc(srf->fp);
2042 off_t saved_pos;
2043 uint64_t dbh_ind = 0;
2044
2045 if ((h & 0x7f) != hval) {
2046 if (h & 0x80)
2047 return -2; /* end of list and not found */
2048 /*
2049 * fseeko(srf->fp, 8, SEEK_CUR);
2050 * Use fread instead as it's likely already cached and linux
2051 * fseeko involves a real system call (lseek).
2052 */
2053 if (item_sz != fread(dpos, 1, item_sz, srf->fp))
2054 return -1;
2055 continue;
2056 }
2057
2058 /* Potential hit - investigate to see if it's the real one: */
2059 /* Seek to dpos and get trace id suffix. Compare to see if valid */
2060 if (0 != srf_read_uint64(srf, dpos))
2061 return -1;
2062 if (hdr.dbh_pos_stored_sep) {
2063 if (0 != srf_read_uint64(srf, &dbh_ind))
2064 return -1;
2065 }
2066 saved_pos = ftello(srf->fp);
2067 if (-1 == fseeko(srf->fp, (off_t)*dpos, SEEK_SET))
2068 return -1;
2069 if (0 != srf_read_trace_body(srf, &srf->tb, 0))
2070 return -1;
2071
2072 /* Identify the matching hpos (trace header) for this trace body */
2073 if (hdr.dbh_pos_stored_sep) {
2074 /* Hack for now - binary scan through 1 object */
2075 if (0 != binary_scan(srf, 1,
2076 ipos + hdr.n_container * 8 + dbh_ind * 8,
2077 *dpos, hpos))
2078 return -1;
2079 } else {
2080 if (0 != binary_scan(srf, hdr.n_data_block_hdr,
2081 ipos + hdr.n_container * 8,
2082 *dpos, hpos))
2083 return -1;
2084 }
2085
2086 /* Check the trace name matches */
2087 if (-1 == fseeko(srf->fp, *hpos, SEEK_SET))
2088 return -1;
2089 if (0 != srf_read_trace_hdr(srf, &srf->th))
2090 return -1;
2091
2092 if (-1 == construct_trace_name(srf->th.id_prefix,
2093 (unsigned char *)srf->tb.read_id,
2094 srf->tb.read_id_length,
2095 name, 1024))
2096 return -1;
2097
2098 if (strcmp(name, tname)) {
2099 /* Not found, continue with next item in list */
2100 if (h & 0x80)
2101 return -2;
2102 if (-1 == fseeko(srf->fp, saved_pos, SEEK_SET))
2103 return -1;
2104 continue;
2105 }
2106
2107 /* Matches, so fetch the container data and return out trace */
2108 if (0 != binary_scan(srf, hdr.n_container,
2109 ipos, *dpos, cpos))
2110 return -1;
2111
2112 /* FIXME: what to do with base-caller and cpos */
2113
2114 break;
2115 }
2116
2117 return 0;
2118 }
2119