1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sysexits.h>
5 #include <xtend/string.h>      // strlcpy() on Linux
6 #include <xtend/dsv.h>
7 #include <xtend/mem.h>
8 #include "sam.h"
9 #include "biolibc.h"
10 
11 /***************************************************************************
12  *  Library:
13  *      #include <biolibc/sam.h>
14  *      -lbiolibc -lxtend
15  *
16  *  Description:
17  *      Read next alignment (line) from a SAM stream.
18  *
19  *      If field_mask is not BL_SAM_FIELD_ALL, fields not indicated by a 1
20  *      in the bit mask are discarded rather than stored in sam_alignment.
21  *      That field in the structure is then populated with an appropriate
22  *      marker, such as '.'.  Possible mask values are:
23  *
24  *      BL_SAM_FIELD_ALL
25  *      BL_SAM_FIELD_QNAME
26  *      BL_SAM_FIELD_FLAG
27  *      BL_SAM_FIELD_RNAME
28  *      BL_SAM_FIELD_POS
29  *      BL_SAM_FIELD_MAPQ
30  *      BL_SAM_FIELD_CIGAR
31  *      BL_SAM_FIELD_RNEXT
32  *      BL_SAM_FIELD_PNEXT
33  *      BL_SAM_FIELD_TLEN
34  *      BL_SAM_FIELD_SEQ
35  *      BL_SAM_FIELD_QUAL
36  *
37  *  Arguments:
38  *      sam_stream      A FILE stream from which to read the line
39  *      sam_alignment   Pointer to a bl_sam_t structure
40  *      field_mask      Bit mask indicating which fields to store in sam_alignment
41  *
42  *  Returns:
43  *      BL_READ_OK on successful read
44  *      BL_READ_EOF if EOF is encountered after a complete feature
45  *      BL_READ_TRUNCATED if EOF or bad data is encountered elsewhere
46  *
47  *  Examples:
48  *      bl_sam_read(stdin, &sam_alignment, BL_SAM_FIELD_ALL);
49  *      bl_sam_read(sam_stream, &sam_alignment,
50  *                         BL_SAM_FIELD_QNAME|BL_SAM_FIELD_POS|BL_SAM_FIELD_TLEN);
51  *
52  *  See also:
53  *      bl_sam_write(3)
54  *
55  *  History:
56  *  Date        Name        Modification
57  *  2019-12-09  Jason Bacon Begin
58  ***************************************************************************/
59 
bl_sam_read(FILE * sam_stream,bl_sam_t * sam_alignment,sam_field_mask_t field_mask)60 int     bl_sam_read(FILE *sam_stream, bl_sam_t *sam_alignment,
61 			   sam_field_mask_t field_mask)
62 
63 {
64     char    mapq_str[BL_SAM_MAPQ_MAX_CHARS + 1],
65 	    temp_seq_or_qual[BL_SAM_SEQ_MAX_CHARS + 1],
66 	    pos_str[BL_POSITION_MAX_DIGITS + 1],
67 	    flag_str[BL_SAM_FLAG_MAX_DIGITS + 1],
68 	    *end;
69     size_t  len;
70     static uint64_t   previous_pos = 0;
71     int     delim;
72 
73     if ( field_mask & BL_SAM_FIELD_QNAME )
74 	delim = tsv_read_field(sam_stream, sam_alignment->qname,
75 			BL_SAM_QNAME_MAX_CHARS, &len);
76     else
77     {
78 	delim = tsv_skip_field(sam_stream);
79 	*sam_alignment->qname = '\0';
80     }
81     if ( delim == EOF )
82 	return BL_READ_EOF;
83 
84     // 2 Flag
85     if ( field_mask & BL_SAM_FIELD_FLAG )
86 	delim = tsv_read_field(sam_stream, flag_str, BL_SAM_FLAG_MAX_DIGITS, &len);
87     else
88 	delim = tsv_skip_field(sam_stream);
89     if ( delim == EOF )
90     {
91 	fprintf(stderr, "bl_sam_read(): Got EOF reading flag: %s.\n",
92 		flag_str);
93 	return BL_READ_TRUNCATED;
94     }
95     if ( field_mask & BL_SAM_FIELD_FLAG )
96     {
97 	sam_alignment->flag = strtoul(flag_str, &end, 10);
98 	if ( *end != '\0' )
99 	{
100 	    fprintf(stderr, "bl_sam_read(): Invalid position: %s\n",
101 		    flag_str);
102 	    fprintf(stderr, "qname = %s rname = %s\n",
103 		    sam_alignment->qname, sam_alignment->rname);
104 	    fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
105 	    exit(EX_DATAERR);
106 	}
107     }
108     else
109 	sam_alignment->flag = 0;    // FIXME: Is there a better choice?
110 
111     // 3 RNAME
112     if ( field_mask & BL_SAM_FIELD_RNAME )
113 	delim = tsv_read_field(sam_stream, sam_alignment->rname,
114 			       BL_SAM_RNAME_MAX_CHARS, &len);
115     else
116     {
117 	delim = tsv_skip_field(sam_stream);
118 	*sam_alignment->rname = '\0';
119     }
120     if ( delim == EOF )
121     {
122 	fprintf(stderr, "bl_sam_read(): Got EOF reading rname: %s.\n",
123 		sam_alignment->rname);
124 	return BL_READ_TRUNCATED;
125     }
126 
127     // 4 POS
128     if ( field_mask & BL_SAM_FIELD_POS )
129 	delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
130 			       &len);
131     else
132 	delim = tsv_skip_field(sam_stream);
133     if ( delim == EOF )
134     {
135 	fprintf(stderr, "bl_sam_read(): Got EOF reading pos: %s.\n",
136 		pos_str);
137 	return BL_READ_TRUNCATED;
138     }
139     if ( field_mask & BL_SAM_FIELD_POS )
140     {
141 	sam_alignment->pos = strtoul(pos_str, &end, 10);
142 	if ( *end != '\0' )
143 	{
144 	    fprintf(stderr, "bl_sam_read(): Invalid position: %s\n",
145 		    pos_str);
146 	    fprintf(stderr, "qname = %s rname = %s\n",
147 		    sam_alignment->qname, sam_alignment->rname);
148 	    fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
149 	    exit(EX_DATAERR);
150 	}
151 	previous_pos = sam_alignment->pos;
152     }
153     else
154 	sam_alignment->pos = 0;
155 
156     // 5 MAPQ
157     if ( field_mask & BL_SAM_FIELD_MAPQ )
158 	delim = tsv_read_field(sam_stream, mapq_str, BL_SAM_MAPQ_MAX_CHARS,
159 			       &len);
160     else
161 	delim = tsv_skip_field(sam_stream);
162     if ( delim == EOF )
163     {
164 	fprintf(stderr, "bl_sam_read(): Got EOF reading mapq: %s.\n",
165 		mapq_str);
166 	return BL_READ_TRUNCATED;
167     }
168 
169     if ( field_mask & BL_SAM_FIELD_MAPQ )
170     {
171 	sam_alignment->mapq = strtoul(mapq_str, &end, 10);
172 	if ( *end != '\0' )
173 	{
174 	    fprintf(stderr, "bl_sam_read(): Invalid mapq: %s\n",
175 		    mapq_str);
176 	    fprintf(stderr, "qname = %s rname = %s\n",
177 		    sam_alignment->qname, sam_alignment->rname);
178 	    fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
179 	    exit(EX_DATAERR);
180 	}
181     }
182     else
183 	sam_alignment->mapq = 0;
184 
185     // 6 CIGAR
186     if ( field_mask & BL_SAM_FIELD_CIGAR )
187 	delim = tsv_read_field(sam_stream, sam_alignment->cigar,
188 			       BL_SAM_CIGAR_MAX_CHARS, &len);
189     else
190     {
191 	delim = tsv_skip_field(sam_stream);
192 	*sam_alignment->cigar = '\0';
193     }
194     if ( delim == EOF )
195     {
196 	fprintf(stderr, "bl_sam_read(): Got EOF reading cigar: %s.\n",
197 		sam_alignment->cigar);
198 	return BL_READ_TRUNCATED;
199     }
200 
201     // 7 RNEXT
202     if ( field_mask & BL_SAM_FIELD_RNEXT )
203 	delim = tsv_read_field(sam_stream, sam_alignment->rnext,
204 			       BL_SAM_RNAME_MAX_CHARS, &len);
205     else
206     {
207 	delim = tsv_skip_field(sam_stream);
208 	*sam_alignment->rnext = '\0';
209     }
210     if ( delim == EOF )
211     {
212 	fprintf(stderr, "bl_sam_read(): Got EOF reading rnext: %s.\n",
213 		sam_alignment->rnext);
214 	return BL_READ_TRUNCATED;
215     }
216 
217     // 8 PNEXT
218     if ( field_mask & BL_SAM_FIELD_PNEXT )
219 	delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
220 			       &len);
221     else
222 	delim = tsv_skip_field(sam_stream);
223     if ( delim == EOF )
224     {
225 	fprintf(stderr, "bl_sam_read(): Got EOF reading pnext: %s.\n",
226 		pos_str);
227 	return BL_READ_TRUNCATED;
228     }
229     if ( field_mask & BL_SAM_FIELD_PNEXT )
230     {
231 	sam_alignment->pnext = strtoul(pos_str, &end, 10);
232 	if ( *end != '\0' )
233 	{
234 	    fprintf(stderr, "bl_sam_read(): Invalid pnext: %s\n",
235 		    pos_str);
236 	    fprintf(stderr, "qname = %s rname = %s\n",
237 		    sam_alignment->qname, sam_alignment->rname);
238 	    fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
239 	    exit(EX_DATAERR);
240 	}
241     }
242     else
243 	sam_alignment->pnext = 0;
244 
245     // 9 TLEN
246     if ( field_mask & BL_SAM_FIELD_TLEN )
247 	delim = tsv_read_field(sam_stream, pos_str, BL_POSITION_MAX_DIGITS,
248 			       &len);
249     else
250 	delim = tsv_skip_field(sam_stream);
251     if ( delim == EOF )
252     {
253 	fprintf(stderr, "bl_sam_read(): Got EOF reading tlen: %s.\n",
254 		pos_str);
255 	return BL_READ_TRUNCATED;
256     }
257     if ( field_mask & BL_SAM_FIELD_TLEN )
258     {
259 	sam_alignment->tlen = strtoul(pos_str, &end, 10);
260 	if ( *end != '\0' )
261 	{
262 	    fprintf(stderr, "bl_sam_read(): Invalid tlen: %s\n",
263 		    pos_str);
264 	    fprintf(stderr, "qname = %s rname = %s\n",
265 		    sam_alignment->qname, sam_alignment->rname);
266 	    fprintf(stderr, "previous_pos = %" PRIu64 "\n", previous_pos);
267 	    exit(EX_DATAERR);
268 	}
269     }
270     else
271 	sam_alignment->tlen = 0;
272 
273     // 10 SEQ
274     if ( field_mask & BL_SAM_FIELD_SEQ )
275 	delim = tsv_read_field(sam_stream, temp_seq_or_qual, BL_SAM_SEQ_MAX_CHARS,
276 			       &sam_alignment->seq_len);
277     else
278     {
279 	delim = tsv_skip_field(sam_stream);
280 	// sam_alignment->seq = NULL;   Mem leak if allocated elsewhere
281     }
282     if ( delim == EOF )
283     {
284 	fprintf(stderr, "bl_sam_read(): Got EOF reading seq: %s.\n",
285 		temp_seq_or_qual);
286 	return BL_READ_TRUNCATED;
287     }
288 
289     if ( field_mask & BL_SAM_FIELD_SEQ )
290     {
291 	// May be allocated by bl_sam_init() or bl_sam_copy()
292 	if ( sam_alignment->seq == NULL )
293 	{
294 	    if ( (sam_alignment->seq = xt_malloc(sam_alignment->seq_len + 1,
295 		    sizeof(*sam_alignment->seq))) == NULL )
296 	    {
297 		fprintf(stderr, "bl_sam_read(): Could not allocate seq.\n");
298 		exit(EX_UNAVAILABLE);
299 	    }
300 	}
301 	memcpy(sam_alignment->seq, temp_seq_or_qual, sam_alignment->seq_len + 1);
302     }
303 
304     // 11 QUAL, should be last field
305     if ( field_mask & BL_SAM_FIELD_QUAL )
306 	delim = tsv_read_field(sam_stream, temp_seq_or_qual, BL_SAM_SEQ_MAX_CHARS,
307 			       &sam_alignment->qual_len);
308     else
309     {
310 	delim = tsv_skip_field(sam_stream);
311 	// sam_alignment->qual = NULL; Mem leak if allocated elsewhere
312     }
313     if ( delim == EOF )
314     {
315 	fprintf(stderr, "bl_sam_read(): Got EOF reading qual: %s.\n",
316 		temp_seq_or_qual);
317 	return BL_READ_TRUNCATED;
318     }
319 
320     if ( field_mask & BL_SAM_FIELD_QUAL )
321     {
322 	// May be allocated by bl_sam_init() or bl_sam_copy()
323 	if ( sam_alignment->qual == NULL )
324 	{
325 	    if ( (sam_alignment->qual = xt_malloc(sam_alignment->qual_len + 1,
326 		    sizeof(*sam_alignment->qual))) == NULL )
327 	    {
328 		fprintf(stderr, "bl_sam_read(): Could not allocate qual.\n");
329 		exit(EX_UNAVAILABLE);
330 	    }
331 	}
332 	memcpy(sam_alignment->qual, temp_seq_or_qual,
333 	       sam_alignment->qual_len + 1);
334 
335 	if ( (sam_alignment->qual_len != 1) &&
336 	     (sam_alignment->seq_len != sam_alignment->qual_len) )
337 	    fprintf(stderr, "bl_sam_read(): Warning: qual_len != seq_len for %s,%" PRIu64 "\n",
338 		    sam_alignment->rname, sam_alignment->pos);
339     }
340 
341     // Some SRA CRAMs have 11 fields, most have 12
342     // Discard everything after the 11th
343     if ( delim == '\t' )
344 	while ( getc(sam_stream) != '\n' )
345 	    ;
346 
347     /*fprintf(stderr,"bl_sam_read(): %s,%" PRIu64 ",%zu\n",
348 	    BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
349 	    BL_SAM_SEQ_LEN(sam_alignment));*/
350 
351     return BL_READ_OK;
352 }
353 
354 
355 /***************************************************************************
356  *  Library:
357  *      #include <biolibc/sam.h>
358  *      -lbiolibc -lxtend
359  *
360  *  Description:
361  *      Copy a SAM alignment as efficiently as possible, allocating memory
362  *      as needed.
363  *
364  *  Arguments:
365  *      dest    Pointer to bl_sam_t structure to receive copy
366  *      src     Pointer to bl_sam_t structure to be copied
367  *
368  *  See also:
369  *      bl_sam_read(3), bl_sam_init(3), bl_sam_free(3)
370  *
371  *  History:
372  *  Date        Name        Modification
373  *  2020-05-27  Jason Bacon Begin
374  ***************************************************************************/
375 
bl_sam_copy(bl_sam_t * dest,bl_sam_t * src)376 void    bl_sam_copy(bl_sam_t *dest, bl_sam_t *src)
377 
378 {
379     strlcpy(dest->qname, src->qname, BL_SAM_QNAME_MAX_CHARS + 1);
380     dest->flag = src->flag;
381     strlcpy(dest->rname, src->rname, BL_SAM_RNAME_MAX_CHARS + 1);
382     dest->pos = src->pos;
383     dest->mapq = src->mapq;
384     // FIXME: Add cigar and RNEXT
385     strlcpy(dest->cigar, src->cigar, BL_SAM_CIGAR_MAX_CHARS + 1);
386     strlcpy(dest->rnext, src->rnext, BL_SAM_RNAME_MAX_CHARS + 1);
387     dest->pnext = src->pnext;
388     dest->tlen = src->tlen;
389 
390     if ( (dest->seq = xt_malloc(src->seq_len + 1,
391 	    sizeof(*dest->seq))) == NULL )
392     {
393 	fprintf(stderr, "bl_sam_copy(): Could not allocate seq.\n");
394 	exit(EX_UNAVAILABLE);
395     }
396     memcpy(dest->seq, src->seq, src->seq_len + 1);
397 
398     if ( (dest->qual = xt_malloc(src->seq_len + 1,
399 	    sizeof(*dest->qual))) == NULL )
400     {
401 	fprintf(stderr, "bl_sam_copy(): Could not allocate qual.\n");
402 	exit(EX_UNAVAILABLE);
403     }
404 
405     /* qual is an optional field */
406     if ( src->qual_len > 0 )
407 	memcpy(dest->qual, src->qual, src->qual_len + 1);
408 
409     dest->seq_len = src->seq_len;
410     dest->qual_len = src->qual_len;
411 }
412 
413 
414 /***************************************************************************
415  *  Library:
416  *      #include <biolibc/sam.h>
417  *      -lbiolibc -lxtend
418  *
419  *  Description:
420  *      Free memory allocated by bl_sam_read() or
421  *      bl_sam_init().
422  *
423  *  Arguments:
424  *      sam_alignment   Pointer to bl_sam_t structure to be freed.
425  *
426  *  See also:
427  *      bl_sam_read(3), bl_sam_init(3), bl_sam_copy(3)
428  *
429  *  History:
430  *  Date        Name        Modification
431  *  2020-05-29  Jason Bacon Begin
432  ***************************************************************************/
433 
bl_sam_free(bl_sam_t * sam_alignment)434 void    bl_sam_free(bl_sam_t *sam_alignment)
435 
436 {
437     if ( sam_alignment->seq != NULL )
438 	free(sam_alignment->seq);
439     if ( sam_alignment->qual != NULL )
440 	free(sam_alignment->qual);
441     // FIXME: Cigar and rnext?
442 }
443 
444 
445 /***************************************************************************
446  *  Library:
447  *      #include <biolibc/sam.h>
448  *      -lbiolibc -lxtend
449  *
450  *  Description:
451  *      Initialize a bl_sam_t structure, allocating memory for
452  *      sequence and quality strings according to seq_len.  Passing a
453  *      seq_len of 0 prevents memory allocation from occurring.
454  *
455  *      Only BL_SAM_FIELD_SEQ and BL_SAM_FIELD_QUAL are meaningful bits in
456  *      field_mask, as they determine whether memory is allocated.  All
457  *      other fields are unconditionally initialized to 0, NULL, or blank.
458  *
459  *  Arguments:
460  *      sam_alignment   Pointer to bl_sam_t structure to initialize
461  *      seq_len         Length of sequence and quality strings
462  *      field_mask      Bit mask indicating which fields will be used
463  *
464  *  See also:
465  *      bl_sam_read(3), bl_sam_free(3), bl_sam_copy(3)
466  *
467  *  History:
468  *  Date        Name        Modification
469  *  2020-05-29  Jason Bacon Begin
470  ***************************************************************************/
471 
bl_sam_init(bl_sam_t * sam_alignment,size_t seq_len,sam_field_mask_t field_mask)472 void    bl_sam_init(bl_sam_t *sam_alignment, size_t seq_len,
473 			   sam_field_mask_t field_mask)
474 
475 {
476     *sam_alignment->qname = '\0';
477     sam_alignment->flag = 0;
478     *sam_alignment->rname = '\0';
479     sam_alignment->pos = 0;
480     sam_alignment->mapq = 0;
481     *sam_alignment->cigar = '\0';
482     *sam_alignment->rnext = '\0';
483     sam_alignment->pnext = 0;
484     sam_alignment->tlen = 0;
485     if ( seq_len == 0 )
486     {
487 	sam_alignment->seq = NULL;
488 	sam_alignment->qual = NULL;
489     }
490     else
491     {
492 	if ( seq_len != 0 )
493 	{
494 	    if ( (field_mask & BL_SAM_FIELD_SEQ) &&
495 		 ((sam_alignment->seq = xt_malloc(seq_len + 1,
496 			sizeof(*sam_alignment->seq))) == NULL) )
497 	    {
498 		fprintf(stderr, "bl_sam_init(): Could not allocate seq.\n");
499 		exit(EX_UNAVAILABLE);
500 	    }
501 	    if ( (field_mask & BL_SAM_FIELD_QUAL) &&
502 		 ((sam_alignment->qual = xt_malloc(seq_len + 1,
503 			sizeof(*sam_alignment->qual))) == NULL) )
504 	    {
505 		fprintf(stderr, "bl_sam_init(): Could not allocate qual.\n");
506 		exit(EX_UNAVAILABLE);
507 	    }
508 	}
509     }
510     sam_alignment->seq_len = seq_len;
511 }
512 
513 
514 /***************************************************************************
515  *  Library:
516  *      #include <biolibc/sam.h>
517  *      -lbiolibc -lxtend
518  *
519  *  Description:
520  *      Write an alignment (line) to a SAM stream.
521  *
522  *      If field_mask is not BL_SAM_FIELD_ALL, fields not indicated by a 1
523  *      in the bit mask are written as an appropriate placeholder such as '.'
524  *      rather than stored in sam_alignment.  Possible mask values are:
525  *
526  *      BL_SAM_FIELD_ALL
527  *      BL_SAM_FIELD_QNAME
528  *      BL_SAM_FIELD_FLAG
529  *      BL_SAM_FIELD_RNAME
530  *      BL_SAM_FIELD_POS
531  *      BL_SAM_FIELD_MAPQ
532  *      BL_SAM_FIELD_CIGAR
533  *      BL_SAM_FIELD_RNEXT
534  *      BL_SAM_FIELD_PNEXT
535  *      BL_SAM_FIELD_TLEN
536  *      BL_SAM_FIELD_SEQ
537  *      BL_SAM_FIELD_QUAL
538  *
539  *  Arguments:
540  *      sam_stream      A FILE stream to which to write the line
541  *      sam_alignment   Pointer to a bl_sam_t structure
542  *      field_mask      Bit mask indicating which fields to store in sam_alignment
543  *
544  *  Returns:
545  *      Number of items written (per fprintf() output)
546  *
547  *  See also:
548  *      bl_sam_read(3)
549  *
550  *  History:
551  *  Date        Name        Modification
552  *  2019-12-09  Jason Bacon Begin
553  ***************************************************************************/
554 
bl_sam_write(FILE * sam_stream,bl_sam_t * sam_alignment,sam_field_mask_t field_mask)555 int     bl_sam_write(FILE *sam_stream, bl_sam_t *sam_alignment,
556 			   sam_field_mask_t field_mask)
557 
558 {
559     int     count;
560 
561     count = fprintf(sam_stream, "%s\t%u\t%s\t%" PRIu64
562 		    "\t%u\t%s\t%s\t%" PRIu64 "zu\t%zu\t%s\t%s\t%zu\t%zu\n",
563 		    sam_alignment->qname,
564 		    sam_alignment->flag,
565 		    sam_alignment->rname,
566 		    sam_alignment->pos,
567 		    sam_alignment->mapq,
568 		    sam_alignment->cigar,
569 		    sam_alignment->rnext,
570 		    sam_alignment->pnext,
571 		    sam_alignment->tlen,
572 		    sam_alignment->seq,
573 		    sam_alignment->qual,
574 		    sam_alignment->seq_len,
575 		    sam_alignment->qual_len);
576     return count;
577 }
578