1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sysexits.h>
5 #include <xtend/string.h> // strlcpy() on Linux
6 #include <xtend/dsv.h>
7 #include <xtend/mem.h>
8 #include "sam.h"
9 #include "biolibc.h"
10
11 /***************************************************************************
12 * Library:
13 * #include <biolibc/sam.h>
14 * -lbiolibc -lxtend
15 *
16 * Description:
17 * Read next alignment (line) from a SAM stream.
18 *
19 * If field_mask is not BL_SAM_FIELD_ALL, fields not indicated by a 1
20 * in the bit mask are discarded rather than stored in sam_alignment.
21 * That field in the structure is then populated with an appropriate
22 * marker, such as '.'. Possible mask values are:
23 *
24 * BL_SAM_FIELD_ALL
25 * BL_SAM_FIELD_QNAME
26 * BL_SAM_FIELD_FLAG
27 * BL_SAM_FIELD_RNAME
28 * BL_SAM_FIELD_POS
29 * BL_SAM_FIELD_MAPQ
30 * BL_SAM_FIELD_CIGAR
31 * BL_SAM_FIELD_RNEXT
32 * BL_SAM_FIELD_PNEXT
33 * BL_SAM_FIELD_TLEN
34 * BL_SAM_FIELD_SEQ
35 * BL_SAM_FIELD_QUAL
36 *
37 * Arguments:
38 * sam_stream A FILE stream from which to read the line
39 * sam_alignment Pointer to a bl_sam_t structure
40 * field_mask Bit mask indicating which fields to store in sam_alignment
41 *
42 * Returns:
43 * BL_READ_OK on successful read
44 * BL_READ_EOF if EOF is encountered after a complete feature
45 * BL_READ_TRUNCATED if EOF or bad data is encountered elsewhere
46 *
47 * Examples:
48 * bl_sam_read(stdin, &sam_alignment, BL_SAM_FIELD_ALL);
49 * bl_sam_read(sam_stream, &sam_alignment,
50 * BL_SAM_FIELD_QNAME|BL_SAM_FIELD_POS|BL_SAM_FIELD_TLEN);
51 *
52 * See also:
53 * bl_sam_write(3)
54 *
55 * History:
56 * Date Name Modification
57 * 2019-12-09 Jason Bacon Begin
58 ***************************************************************************/
59
bl_sam_read(FILE * sam_stream,bl_sam_t * sam_alignment,sam_field_mask_t field_mask)60 int bl_sam_read(FILE *sam_stream, bl_sam_t *sam_alignment,
61 sam_field_mask_t field_mask)
62
63 {
64 char mapq_str[BL_SAM_MAPQ_MAX_CHARS + 1],
65 temp_seq_or_qual[BL_SAM_SEQ_MAX_CHARS + 1],
66 pos_str[BL_POSITION_MAX_DIGITS + 1],
67 flag_str[BL_SAM_FLAG_MAX_DIGITS + 1],
68 *end;
69 size_t len;
70 static uint64_t previous_pos = 0;
71 int delim;
72
73 if ( field_mask & BL_SAM_FIELD_QNAME )
74 delim = tsv_read_field(sam_stream, sam_alignment->qname,
75 BL_SAM_QNAME_MAX_CHARS, &len);
76 else
77 {
78 delim = tsv_skip_field(sam_stream);
79 *sam_alignment->qname = '\0';
80 }
81 if ( delim == EOF )
82 return BL_READ_EOF;
83
84 // 2 Flag
85 if ( field_mask & BL_SAM_FIELD_FLAG )
86 delim = tsv_read_field(sam_stream, flag_str, BL_SAM_FLAG_MAX_DIGITS, &len);
87 else
88 delim = tsv_skip_field(sam_stream);
89 if ( delim == EOF )
90 {
91 fprintf(stderr, "bl_sam_read(): Got EOF reading flag: %s.\n",
92 flag_str);
93 return BL_READ_TRUNCATED;
94 }
95 if ( field_mask & BL_SAM_FIELD_FLAG )
96 {
97 sam_alignment->flag = strtoul(flag_str, &end, 10);
98 if ( *end != '\0' )
99 {
100 fprintf(stderr, "bl_sam_read(): Invalid position: %s\n",
101 flag_str);
102 fprintf(stderr, "qname = %s rname = %s\n",
103 sam_alignment->qname, sam_alignment->rname);
104 fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
105 exit(EX_DATAERR);
106 }
107 }
108 else
109 sam_alignment->flag = 0; // FIXME: Is there a better choice?
110
111 // 3 RNAME
112 if ( field_mask & BL_SAM_FIELD_RNAME )
113 delim = tsv_read_field(sam_stream, sam_alignment->rname,
114 BL_SAM_RNAME_MAX_CHARS, &len);
115 else
116 {
117 delim = tsv_skip_field(sam_stream);
118 *sam_alignment->rname = '\0';
119 }
120 if ( delim == EOF )
121 {
122 fprintf(stderr, "bl_sam_read(): Got EOF reading rname: %s.\n",
123 sam_alignment->rname);
124 return BL_READ_TRUNCATED;
125 }
126
127 // 4 POS
128 if ( field_mask & BL_SAM_FIELD_POS )
129 delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
130 &len);
131 else
132 delim = tsv_skip_field(sam_stream);
133 if ( delim == EOF )
134 {
135 fprintf(stderr, "bl_sam_read(): Got EOF reading pos: %s.\n",
136 pos_str);
137 return BL_READ_TRUNCATED;
138 }
139 if ( field_mask & BL_SAM_FIELD_POS )
140 {
141 sam_alignment->pos = strtoul(pos_str, &end, 10);
142 if ( *end != '\0' )
143 {
144 fprintf(stderr, "bl_sam_read(): Invalid position: %s\n",
145 pos_str);
146 fprintf(stderr, "qname = %s rname = %s\n",
147 sam_alignment->qname, sam_alignment->rname);
148 fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
149 exit(EX_DATAERR);
150 }
151 previous_pos = sam_alignment->pos;
152 }
153 else
154 sam_alignment->pos = 0;
155
156 // 5 MAPQ
157 if ( field_mask & BL_SAM_FIELD_MAPQ )
158 delim = tsv_read_field(sam_stream, mapq_str, BL_SAM_MAPQ_MAX_CHARS,
159 &len);
160 else
161 delim = tsv_skip_field(sam_stream);
162 if ( delim == EOF )
163 {
164 fprintf(stderr, "bl_sam_read(): Got EOF reading mapq: %s.\n",
165 mapq_str);
166 return BL_READ_TRUNCATED;
167 }
168
169 if ( field_mask & BL_SAM_FIELD_MAPQ )
170 {
171 sam_alignment->mapq = strtoul(mapq_str, &end, 10);
172 if ( *end != '\0' )
173 {
174 fprintf(stderr, "bl_sam_read(): Invalid mapq: %s\n",
175 mapq_str);
176 fprintf(stderr, "qname = %s rname = %s\n",
177 sam_alignment->qname, sam_alignment->rname);
178 fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
179 exit(EX_DATAERR);
180 }
181 }
182 else
183 sam_alignment->mapq = 0;
184
185 // 6 CIGAR
186 if ( field_mask & BL_SAM_FIELD_CIGAR )
187 delim = tsv_read_field(sam_stream, sam_alignment->cigar,
188 BL_SAM_CIGAR_MAX_CHARS, &len);
189 else
190 {
191 delim = tsv_skip_field(sam_stream);
192 *sam_alignment->cigar = '\0';
193 }
194 if ( delim == EOF )
195 {
196 fprintf(stderr, "bl_sam_read(): Got EOF reading cigar: %s.\n",
197 sam_alignment->cigar);
198 return BL_READ_TRUNCATED;
199 }
200
201 // 7 RNEXT
202 if ( field_mask & BL_SAM_FIELD_RNEXT )
203 delim = tsv_read_field(sam_stream, sam_alignment->rnext,
204 BL_SAM_RNAME_MAX_CHARS, &len);
205 else
206 {
207 delim = tsv_skip_field(sam_stream);
208 *sam_alignment->rnext = '\0';
209 }
210 if ( delim == EOF )
211 {
212 fprintf(stderr, "bl_sam_read(): Got EOF reading rnext: %s.\n",
213 sam_alignment->rnext);
214 return BL_READ_TRUNCATED;
215 }
216
217 // 8 PNEXT
218 if ( field_mask & BL_SAM_FIELD_PNEXT )
219 delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
220 &len);
221 else
222 delim = tsv_skip_field(sam_stream);
223 if ( delim == EOF )
224 {
225 fprintf(stderr, "bl_sam_read(): Got EOF reading pnext: %s.\n",
226 pos_str);
227 return BL_READ_TRUNCATED;
228 }
229 if ( field_mask & BL_SAM_FIELD_PNEXT )
230 {
231 sam_alignment->pnext = strtoul(pos_str, &end, 10);
232 if ( *end != '\0' )
233 {
234 fprintf(stderr, "bl_sam_read(): Invalid pnext: %s\n",
235 pos_str);
236 fprintf(stderr, "qname = %s rname = %s\n",
237 sam_alignment->qname, sam_alignment->rname);
238 fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
239 exit(EX_DATAERR);
240 }
241 }
242 else
243 sam_alignment->pnext = 0;
244
245 // 9 TLEN
246 if ( field_mask & BL_SAM_FIELD_TLEN )
247 delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
248 &len);
249 else
250 delim = tsv_skip_field(sam_stream);
251 if ( delim == EOF )
252 {
253 fprintf(stderr, "bl_sam_read(): Got EOF reading tlen: %s.\n",
254 pos_str);
255 return BL_READ_TRUNCATED;
256 }
257 if ( field_mask & BL_SAM_FIELD_TLEN )
258 {
259 sam_alignment->tlen = strtoul(pos_str, &end, 10);
260 if ( *end != '\0' )
261 {
262 fprintf(stderr, "bl_sam_read(): Invalid tlen: %s\n",
263 pos_str);
264 fprintf(stderr, "qname = %s rname = %s\n",
265 sam_alignment->qname, sam_alignment->rname);
266 fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
267 exit(EX_DATAERR);
268 }
269 }
270 else
271 sam_alignment->tlen = 0;
272
273 // 10 SEQ
274 if ( field_mask & BL_SAM_FIELD_SEQ )
275 delim = tsv_read_field(sam_stream, temp_seq_or_qual, BL_SAM_SEQ_MAX_CHARS,
276 &sam_alignment->seq_len);
277 else
278 {
279 delim = tsv_skip_field(sam_stream);
280 // sam_alignment->seq = NULL; Mem leak if allocated elsewhere
281 }
282 if ( delim == EOF )
283 {
284 fprintf(stderr, "bl_sam_read(): Got EOF reading seq: %s.\n",
285 temp_seq_or_qual);
286 return BL_READ_TRUNCATED;
287 }
288
289 if ( field_mask & BL_SAM_FIELD_SEQ )
290 {
291 // May be allocated by bl_sam_init() or bl_sam_copy()
292 if ( sam_alignment->seq == NULL )
293 {
294 if ( (sam_alignment->seq = xt_malloc(sam_alignment->seq_len + 1,
295 sizeof(*sam_alignment->seq))) == NULL )
296 {
297 fprintf(stderr, "bl_sam_read(): Could not allocate seq.\n");
298 exit(EX_UNAVAILABLE);
299 }
300 }
301 memcpy(sam_alignment->seq, temp_seq_or_qual, sam_alignment->seq_len + 1);
302 }
303
304 // 11 QUAL, should be last field
305 if ( field_mask & BL_SAM_FIELD_QUAL )
306 delim = tsv_read_field(sam_stream, temp_seq_or_qual, BL_SAM_SEQ_MAX_CHARS,
307 &sam_alignment->qual_len);
308 else
309 {
310 delim = tsv_skip_field(sam_stream);
311 // sam_alignment->qual = NULL; Mem leak if allocated elsewhere
312 }
313 if ( delim == EOF )
314 {
315 fprintf(stderr, "bl_sam_read(): Got EOF reading qual: %s.\n",
316 temp_seq_or_qual);
317 return BL_READ_TRUNCATED;
318 }
319
320 if ( field_mask & BL_SAM_FIELD_QUAL )
321 {
322 // May be allocated by bl_sam_init() or bl_sam_copy()
323 if ( sam_alignment->qual == NULL )
324 {
325 if ( (sam_alignment->qual = xt_malloc(sam_alignment->qual_len + 1,
326 sizeof(*sam_alignment->qual))) == NULL )
327 {
328 fprintf(stderr, "bl_sam_read(): Could not allocate qual.\n");
329 exit(EX_UNAVAILABLE);
330 }
331 }
332 memcpy(sam_alignment->qual, temp_seq_or_qual,
333 sam_alignment->qual_len + 1);
334
335 if ( (sam_alignment->qual_len != 1) &&
336 (sam_alignment->seq_len != sam_alignment->qual_len) )
337 fprintf(stderr, "bl_sam_read(): Warning: qual_len != seq_len for %s,%" PRIu64 "\n",
338 sam_alignment->rname, sam_alignment->pos);
339 }
340
341 // Some SRA CRAMs have 11 fields, most have 12
342 // Discard everything after the 11th
343 if ( delim == '\t' )
344 while ( getc(sam_stream) != '\n' )
345 ;
346
347 /*fprintf(stderr,"bl_sam_read(): %s,%" PRIu64 ",%zu\n",
348 BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
349 BL_SAM_SEQ_LEN(sam_alignment));*/
350
351 return BL_READ_OK;
352 }
353
354
355 /***************************************************************************
356 * Library:
357 * #include <biolibc/sam.h>
358 * -lbiolibc -lxtend
359 *
360 * Description:
361 * Copy a SAM alignment as efficiently as possible, allocating memory
362 * as needed.
363 *
364 * Arguments:
365 * dest Pointer to bl_sam_t structure to receive copy
366 * src Pointer to bl_sam_t structure to be copied
367 *
368 * See also:
369 * bl_sam_read(3), bl_sam_init(3), bl_sam_free(3)
370 *
371 * History:
372 * Date Name Modification
373 * 2020-05-27 Jason Bacon Begin
374 ***************************************************************************/
375
bl_sam_copy(bl_sam_t * dest,bl_sam_t * src)376 void bl_sam_copy(bl_sam_t *dest, bl_sam_t *src)
377
378 {
379 strlcpy(dest->qname, src->qname, BL_SAM_QNAME_MAX_CHARS + 1);
380 dest->flag = src->flag;
381 strlcpy(dest->rname, src->rname, BL_SAM_RNAME_MAX_CHARS + 1);
382 dest->pos = src->pos;
383 dest->mapq = src->mapq;
384 // FIXME: Add cigar and RNEXT
385 strlcpy(dest->cigar, src->cigar, BL_SAM_CIGAR_MAX_CHARS + 1);
386 strlcpy(dest->rnext, src->rnext, BL_SAM_RNAME_MAX_CHARS + 1);
387 dest->pnext = src->pnext;
388 dest->tlen = src->tlen;
389
390 if ( (dest->seq = xt_malloc(src->seq_len + 1,
391 sizeof(*dest->seq))) == NULL )
392 {
393 fprintf(stderr, "bl_sam_copy(): Could not allocate seq.\n");
394 exit(EX_UNAVAILABLE);
395 }
396 memcpy(dest->seq, src->seq, src->seq_len + 1);
397
398 if ( (dest->qual = xt_malloc(src->seq_len + 1,
399 sizeof(*dest->qual))) == NULL )
400 {
401 fprintf(stderr, "bl_sam_copy(): Could not allocate qual.\n");
402 exit(EX_UNAVAILABLE);
403 }
404
405 /* qual is an optional field */
406 if ( src->qual_len > 0 )
407 memcpy(dest->qual, src->qual, src->qual_len + 1);
408
409 dest->seq_len = src->seq_len;
410 dest->qual_len = src->qual_len;
411 }
412
413
414 /***************************************************************************
415 * Library:
416 * #include <biolibc/sam.h>
417 * -lbiolibc -lxtend
418 *
419 * Description:
420 * Free memory allocated by bl_sam_read() or
421 * bl_sam_init().
422 *
423 * Arguments:
424 * sam_alignment Pointer to bl_sam_t structure to be freed.
425 *
426 * See also:
427 * bl_sam_read(3), bl_sam_init(3), bl_sam_copy(3)
428 *
429 * History:
430 * Date Name Modification
431 * 2020-05-29 Jason Bacon Begin
432 ***************************************************************************/
433
bl_sam_free(bl_sam_t * sam_alignment)434 void bl_sam_free(bl_sam_t *sam_alignment)
435
436 {
437 if ( sam_alignment->seq != NULL )
438 free(sam_alignment->seq);
439 if ( sam_alignment->qual != NULL )
440 free(sam_alignment->qual);
441 // FIXME: Cigar and rnext?
442 }
443
444
445 /***************************************************************************
446 * Library:
447 * #include <biolibc/sam.h>
448 * -lbiolibc -lxtend
449 *
450 * Description:
451 * Initialize a bl_sam_t structure, allocating memory for
452 * sequence and quality strings according to seq_len. Passing a
453 * seq_len of 0 prevents memory allocation from occurring.
454 *
455 * Only BL_SAM_FIELD_SEQ and BL_SAM_FIELD_QUAL are meaningful bits in
456 * field_mask, as they determine whether memory is allocated. All
457 * other fields are unconditionally initialized to 0, NULL, or blank.
458 *
459 * Arguments:
460 * sam_alignment Pointer to bl_sam_t structure to initialize
461 * seq_len Length of sequence and quality strings
462 * field_mask Bit mask indicating which fields will be used
463 *
464 * See also:
465 * bl_sam_read(3), bl_sam_free(3), bl_sam_copy(3)
466 *
467 * History:
468 * Date Name Modification
469 * 2020-05-29 Jason Bacon Begin
470 ***************************************************************************/
471
bl_sam_init(bl_sam_t * sam_alignment,size_t seq_len,sam_field_mask_t field_mask)472 void bl_sam_init(bl_sam_t *sam_alignment, size_t seq_len,
473 sam_field_mask_t field_mask)
474
475 {
476 *sam_alignment->qname = '\0';
477 sam_alignment->flag = 0;
478 *sam_alignment->rname = '\0';
479 sam_alignment->pos = 0;
480 sam_alignment->mapq = 0;
481 *sam_alignment->cigar = '\0';
482 *sam_alignment->rnext = '\0';
483 sam_alignment->pnext = 0;
484 sam_alignment->tlen = 0;
485 if ( seq_len == 0 )
486 {
487 sam_alignment->seq = NULL;
488 sam_alignment->qual = NULL;
489 }
490 else
491 {
492 if ( seq_len != 0 )
493 {
494 if ( (field_mask & BL_SAM_FIELD_SEQ) &&
495 ((sam_alignment->seq = xt_malloc(seq_len + 1,
496 sizeof(*sam_alignment->seq))) == NULL) )
497 {
498 fprintf(stderr, "bl_sam_init(): Could not allocate seq.\n");
499 exit(EX_UNAVAILABLE);
500 }
501 if ( (field_mask & BL_SAM_FIELD_QUAL) &&
502 ((sam_alignment->qual = xt_malloc(seq_len + 1,
503 sizeof(*sam_alignment->qual))) == NULL) )
504 {
505 fprintf(stderr, "bl_sam_init(): Could not allocate qual.\n");
506 exit(EX_UNAVAILABLE);
507 }
508 }
509 }
510 sam_alignment->seq_len = seq_len;
511 }
512
513
514 /***************************************************************************
515 * Library:
516 * #include <biolibc/sam.h>
517 * -lbiolibc -lxtend
518 *
519 * Description:
520 * Write an alignment (line) to a SAM stream.
521 *
522 * If field_mask is not BL_SAM_FIELD_ALL, fields not indicated by a 1
523 * in the bit mask are written as an appropriate placeholder such as '.'
524 * rather than stored in sam_alignment. Possible mask values are:
525 *
526 * BL_SAM_FIELD_ALL
527 * BL_SAM_FIELD_QNAME
528 * BL_SAM_FIELD_FLAG
529 * BL_SAM_FIELD_RNAME
530 * BL_SAM_FIELD_POS
531 * BL_SAM_FIELD_MAPQ
532 * BL_SAM_FIELD_CIGAR
533 * BL_SAM_FIELD_RNEXT
534 * BL_SAM_FIELD_PNEXT
535 * BL_SAM_FIELD_TLEN
536 * BL_SAM_FIELD_SEQ
537 * BL_SAM_FIELD_QUAL
538 *
539 * Arguments:
540 * sam_stream A FILE stream to which to write the line
541 * sam_alignment Pointer to a bl_sam_t structure
542 * field_mask Bit mask indicating which fields to store in sam_alignment
543 *
544 * Returns:
545 * Number of items written (per fprintf() output)
546 *
547 * See also:
548 * bl_sam_read(3)
549 *
550 * History:
551 * Date Name Modification
552 * 2019-12-09 Jason Bacon Begin
553 ***************************************************************************/
554
bl_sam_write(FILE * sam_stream,bl_sam_t * sam_alignment,sam_field_mask_t field_mask)555 int bl_sam_write(FILE *sam_stream, bl_sam_t *sam_alignment,
556 sam_field_mask_t field_mask)
557
558 {
559 int count;
560
561 count = fprintf(sam_stream, "%s\t%u\t%s\t%" PRIu64
562 "\t%u\t%s\t%s\t%" PRIu64 "zu\t%zu\t%s\t%s\t%zu\t%zu\n",
563 sam_alignment->qname,
564 sam_alignment->flag,
565 sam_alignment->rname,
566 sam_alignment->pos,
567 sam_alignment->mapq,
568 sam_alignment->cigar,
569 sam_alignment->rnext,
570 sam_alignment->pnext,
571 sam_alignment->tlen,
572 sam_alignment->seq,
573 sam_alignment->qual,
574 sam_alignment->seq_len,
575 sam_alignment->qual_len);
576 return count;
577 }
578