1 #include <sysexits.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/param.h>  // MIN()
5 #include <xtend/string.h>      // strlcpy() on Linux
6 #include <xtend/mem.h>
7 #include <inttypes.h>
8 #include "sam-buff.h"
9 #include "biostring.h"
10 
11 /***************************************************************************
12  *  Library:
13  *      #include <biolibc/sam-buff.h>
14  *      -lbiolibc -lxtend
15  *
16  *  Description:
17  *      Check that the newly read SAM alignment comes after the previous
18  *      one, assuming the input is sorted first by chrom and then
19  *      position.  The previous chrom and position are stored in
20  *      sam_buff (and initialized so that the first SAM alignment read is
21  *      always OK).
22  *
23  *  Arguments:
24  *      sam_buff        Pointer to a SAM buffer with recent alignments
25  *      sam_alignment   Pointer to the most recently read alignment
26  *
27  *  See also:
28  *      bl_sam_read(3)
29  *
30  *  History:
31  *  Date        Name        Modification
32  *  2020-05-27  Jason Bacon Begin
33  ***************************************************************************/
34 
bl_sam_buff_check_order(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)35 void    bl_sam_buff_check_order(bl_sam_buff_t *sam_buff,
36 			     bl_sam_t *sam_alignment)
37 
38 {
39     /*fprintf(stderr, "Previous SAM: %s %zu, Current SAM: %s %zu\n",
40 	    sam_buff->previous_rname, sam_buff->previous_pos,
41 	    sam_alignment->rname, sam_alignment->pos);*/
42     if ( strcmp(sam_alignment->rname, sam_buff->previous_rname) == 0 )
43     {
44 	// Silly to assign when already ==, but sillier to add another check
45 	if (sam_alignment->pos < sam_buff->previous_pos )
46 	    bl_sam_buff_out_of_order(sam_buff, sam_alignment);
47 	else
48 	    sam_buff->previous_pos = sam_alignment->pos;
49     }
50     else if ( bl_chrom_name_cmp(sam_alignment->rname,
51 				  sam_buff->previous_rname) < 0 )
52 	bl_sam_buff_out_of_order(sam_buff, sam_alignment);
53     else
54     {
55 	strlcpy(sam_buff->previous_rname, sam_alignment->rname, BL_SAM_RNAME_MAX_CHARS);
56 	sam_buff->previous_pos = sam_alignment->pos;
57     }
58 }
59 
60 
61 /***************************************************************************
62  *  Library:
63  *      #include <biolibc/sam-buff.h>
64  *      -lbiolibc -lxtend
65  *
66  *  Description:
67  *      Initialize a SAM alignment buffer for holding recently read SAM
68  *      alignments.  This is useful, for example, when scanning a SAM
69  *      stream for alignments overlapping a certain region or position.
70  *      The buffer array is set to a
71  *      reasonable initial size and extended as far as BL_SAM_BUFF_MAX_SIZE
72  *      by bl_sam_buff_add_alignment(3) if needed.  A minimum MAPQ value
73  *      is stored in the bl_sam_buff_t structure for filtering with
74  *      bl_sam_buff_alignment_ok(3).
75  *
76  *  Arguments:
77  *      sam_buff    Pointer to a the bl_sam_buff_t structure to initialize
78  *      mapq_min    User-selected minimum MAPQ value
79  *
80  *  See also:
81  *      bl_sam_buff_check_order(3), bl_sam_read(3)
82  *
83  *  History:
84  *  Date        Name        Modification
85  *  2020-05-27  Jason Bacon Begin
86  ***************************************************************************/
87 
bl_sam_buff_init(bl_sam_buff_t * sam_buff,unsigned int mapq_min,size_t max_alignments)88 void    bl_sam_buff_init(bl_sam_buff_t *sam_buff, unsigned int mapq_min,
89 			 size_t max_alignments)
90 
91 {
92     size_t  c;
93 
94     sam_buff->buff_size = BL_SAM_BUFF_START_SIZE;
95     sam_buff->max_alignments = max_alignments;
96     sam_buff->buffered_count = 0;
97     sam_buff->max_count = 0;
98     sam_buff->previous_pos = 0;
99     *sam_buff->previous_rname = '\0';
100 
101     sam_buff->mapq_min = mapq_min;
102     sam_buff->mapq_low = UINT64_MAX;
103     sam_buff->mapq_high = 0;
104     sam_buff->mapq_sum = 0;
105     sam_buff->reads_used = 0;
106 
107     sam_buff->total_alignments = 0;
108     sam_buff->trailing_alignments = 0;
109     sam_buff->discarded_alignments = 0;
110     sam_buff->discarded_score_sum = 0;
111     sam_buff->min_discarded_score = SIZE_MAX;
112     sam_buff->max_discarded_score = 0;
113     sam_buff->discarded_trailing = 0;
114     sam_buff->unmapped_alignments = 0;
115 
116     /*
117      *  Dynamically allocating the pointers is probably senseless since they
118      *  take very little space compared to the alignment data.  By the time
119      *  the pointer array takes a significant amount of RAM, you're probably
120      *  already thrashing to accomodate the sequence data.  The pointer array
121      *  size is capped by BL_SAM_BUFF_MAX_SIZE to prevent memory exhaustion.
122      *  We may save a few megabytes with this, though.
123      */
124     sam_buff->alignments =
125 	(bl_sam_t **)xt_malloc(sam_buff->buff_size,
126 				   sizeof(bl_sam_t **));
127     for (c = 0; c < sam_buff->buff_size; ++c)
128 	sam_buff->alignments[c] = NULL;
129 }
130 
131 
132 /***************************************************************************
133  *  Library:
134  *      #include <biolibc/sam-buff.h>
135  *      -lbiolibc -lxtend
136  *
137  *  Description:
138  *      Add a new alignment to the buffer, expanding the array as needed
139  *      up to BL_SAM_BUFF_MAX_SIZE.
140  *
141  *  Arguments:
142  *      sam_buff    Pointer to bl_sam_buff_t structure where alignments are buffered
143  *      sam_alignment   New SAM alignment to add to buffer
144  *
145  *  See also:
146  *      bl_sam_buff_init(3), bl_sam_buff_check_order(3)
147  *
148  *  History:
149  *  Date        Name        Modification
150  *  2020-05-27  Jason Bacon Begin
151  ***************************************************************************/
152 
bl_sam_buff_add_alignment(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)153 int     bl_sam_buff_add_alignment(bl_sam_buff_t *sam_buff,
154 			       bl_sam_t *sam_alignment)
155 
156 {
157     size_t  old_buff_size,
158 	    c;
159 
160     bl_sam_buff_check_order(sam_buff, sam_alignment);
161 
162     sam_buff->mapq_low = MIN(sam_buff->mapq_low, BL_SAM_MAPQ(sam_alignment));
163     sam_buff->mapq_high = MAX(sam_buff->mapq_high, BL_SAM_MAPQ(sam_alignment));
164     sam_buff->mapq_sum += BL_SAM_MAPQ(sam_alignment);
165     ++sam_buff->reads_used;
166 
167     // Just allocate the static fields, bl_sam_copy() does the rest
168     if ( sam_buff->alignments[sam_buff->buffered_count] == NULL )
169     {
170 	//fprintf(stderr, "Allocating alignment #%zu\n", sam_buff->buffered_count);
171 	sam_buff->alignments[sam_buff->buffered_count] =
172 	    xt_malloc(1, sizeof(bl_sam_t));
173 	if ( sam_buff->alignments[sam_buff->buffered_count] == NULL )
174 	{
175 	    fprintf(stderr, "bl_sam_buff_add_alignment(): Could not allocate alignments.\n");
176 	    exit(EX_UNAVAILABLE);
177 	}
178 	// Redundant to bl_sam_copy()
179 	// bl_sam_init(sam_buff->alignments[sam_buff->buffered_count], 0);
180     }
181     else
182 	bl_sam_free(sam_buff->alignments[sam_buff->buffered_count]);
183 
184     bl_sam_copy(sam_buff->alignments[sam_buff->buffered_count], sam_alignment);
185 
186     ++sam_buff->buffered_count;
187 
188     if ( sam_buff->buffered_count > sam_buff->max_count )
189     {
190 	sam_buff->max_count = sam_buff->buffered_count;
191 	// fprintf(stderr, "sam_buff->max_count = %zu\n", sam_buff->max_count);
192     }
193 
194     if ( sam_buff->buffered_count == sam_buff->max_alignments )
195     {
196 	fprintf(stderr,
197 		"bl_sam_buff_add_alignment(): Hit maximum alignments=%zu.\n",
198 		sam_buff->max_alignments);
199 	fprintf(stderr, "Aborting add to prevent runaway memory use.\n");
200 	fprintf(stderr, "Check your SAM input.\n");
201 	return BL_SAM_BUFF_ADD_FAILED;
202     }
203 
204     if ( sam_buff->buffered_count == sam_buff->buff_size )
205     {
206 	fprintf(stderr,
207 		"bl_sam_buff_add_alignment(): Hit buff_size=%zu, doubling buffer size.\n",
208 		sam_buff->buff_size);
209 	fprintf(stderr, "RNAME: %s  POS: %" PRIu64 " LEN: %zu\n",
210 		BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
211 		BL_SAM_SEQ_LEN(sam_alignment));
212 	old_buff_size = sam_buff->buff_size;
213 	sam_buff->buff_size *= 2;
214 	sam_buff->alignments =
215 	    (bl_sam_t **)xt_realloc(sam_buff->alignments,
216 					sam_buff->buff_size,
217 					sizeof(bl_sam_t **));
218 	for (c = old_buff_size; c < sam_buff->buff_size; ++c)
219 	    sam_buff->alignments[c] = NULL;
220     }
221     return BL_SAM_BUFF_OK;
222 }
223 
224 
225 /***************************************************************************
226  *  Library:
227  *      #include <biolibc/sam-buff.h>
228  *      -lbiolibc -lxtend
229  *
230  *  Description:
231  *      Report SAM input sort error and terminate process.
232  *
233  *  Arguments:
234  *      sam_buff        Pointer to bl_sam_buff_t structure
235  *      sam_alignment   Offending alignment out of order with previous
236  *
237  *  Returns:
238  *      Does not return.
239  *
240  *  See also:
241  *      bl_sam_buff_alignment_ok(3)
242  *
243  *  History:
244  *  Date        Name        Modification
245  *  2020-05-27  Jason Bacon Begin
246  ***************************************************************************/
247 
bl_sam_buff_out_of_order(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)248 void    bl_sam_buff_out_of_order(bl_sam_buff_t *sam_buff, bl_sam_t *sam_alignment)
249 
250 {
251     fprintf(stderr, "Error: SAM input must be sorted by chrom and then position.\n");
252     fprintf(stderr, "Found %s,%" PRIu64 " after %s,%" PRIu64 ".\n",
253 	    BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
254 	    sam_buff->previous_rname, sam_buff->previous_pos);
255     exit(EX_DATAERR);
256 }
257 
258 
259 /***************************************************************************
260  *  Library:
261  *      #include <biolibc/sam-buff.h>
262  *      -lbiolibc -lxtend
263  *
264  *  Description:
265  *      Free an element of the SAM alignment array by first freeing all
266  *      memory allocated by the bl_sam_t structure and then freeing
267  *      memory allocated for the structure itself.
268  *
269  *  Arguments:
270  *      sam_buff    Pointer to the bl_sam_buff_t structure holding alignments
271  *      c           Index of the alignment to be freed (0-based)
272  *
273  *  See also:
274  *      bl_sam_buff_init(3), bl_sam_buff_add_alignment(3)
275  *
276  *  History:
277  *  Date        Name        Modification
278  *  2020-05-29  Jason Bacon Begin
279  ***************************************************************************/
280 
bl_sam_buff_free_alignment(bl_sam_buff_t * sam_buff,size_t c)281 void    bl_sam_buff_free_alignment(bl_sam_buff_t *sam_buff, size_t c)
282 
283 {
284     bl_sam_free(sam_buff->alignments[c]);
285     bl_sam_init(sam_buff->alignments[c], 0, BL_SAM_FIELD_ALL);
286     if ( sam_buff->alignments[c] != NULL )
287     {
288 	free(sam_buff->alignments[c]);
289 	sam_buff->alignments[c] = NULL;
290     }
291 }
292 
293 
294 /***************************************************************************
295  *  Library:
296  *      #include <biolibc/sam-buff.h>
297  *      -lbiolibc -lxtend
298  *
299  *  Description:
300  *      Free nelem SAM alignments at the head of the queue and shift
301  *      remaining elements forward nelem positions.
302  *
303  *  Arguments:
304  *      sam_buff    Pointer to bl_sam_buff_t structure holding alignments
305  *      nelem       Number of alignments to free
306  *
307  *  See also:
308  *      bl_sam_buff_free_alignment(3)
309  *
310  *  FIXME: Use circular queuing for better efficiency
311  *
312  *  History:
313  *  Date        Name        Modification
314  *  2020-05-29  Jason Bacon Begin
315  ***************************************************************************/
316 
bl_sam_buff_shift(bl_sam_buff_t * sam_buff,size_t nelem)317 void    bl_sam_buff_shift(bl_sam_buff_t *sam_buff, size_t nelem)
318 
319 {
320     size_t  c;
321 
322     /*
323      *  FIXME: A circular queue would be more efficient, but won't matter
324      *  much to the bottom line to avoid shifting a few pointers on top
325      *  of everything else going on here
326      */
327 
328     /* Make susre elements to be removed are freed */
329     for (c = 0; c < nelem; ++c)
330 	bl_sam_buff_free_alignment(sam_buff, c);
331 
332     /* Shift elements */
333     for (c = 0; c < sam_buff->buffered_count - nelem; ++c)
334 	sam_buff->alignments[c] = sam_buff->alignments[c + nelem];
335 
336     /* Clear vacated elements */
337     while ( c < sam_buff->buffered_count )
338 	sam_buff->alignments[c++] = NULL;
339 
340     sam_buff->buffered_count -= nelem;
341 }
342 
343 
344 /***************************************************************************
345  *  Library:
346  *      #include <biolibc/sam-buff.h>
347  *      -lbiolibc -lxtend
348  *
349  *  Description:
350  *      Verify that an alignment meets the quality requirements for the
351  *      given SAM buffer.  Each bl_sam_buff_t structure contains
352  *      specifications for minimum quality scores as well as statistics
353  *      on alignments checked, including the number of discarded alignments
354  *      and a sum of the alignment scores (for calculating the average
355  *      score of discarded alignments).
356  *
357  *  Arguments:
358  *      sam_buff    Pointer to bl_sam_buff_t structure with quality specs
359  *      sam_alignment   Pointer to new alignment
360  *
361  *  Returns:
362  *      true if the alignment meets the quality requirements
363  *      false otherwise
364  *
365  *  See also:
366  *      bl_sam_buff_check_order(3)
367  *
368  *  History:
369  *  Date        Name        Modification
370  *  2020-05-29  Jason Bacon Begin
371  ***************************************************************************/
372 
bl_sam_buff_alignment_ok(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)373 bool    bl_sam_buff_alignment_ok(bl_sam_buff_t *sam_buff,
374 			      bl_sam_t *sam_alignment)
375 
376 {
377     if ( sam_alignment->flag & BAM_FUNMAP )
378     {
379 	++sam_buff->unmapped_alignments;
380 #ifdef DEBUG
381 	fprintf(stderr, "Discarding unmapped read: %s,%zu\n",
382 		BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment));
383 #endif
384 	return false;
385     }
386     else if ( BL_SAM_MAPQ(sam_alignment) < BL_SAM_BUFF_MAPQ_MIN(sam_buff) )
387     {
388 	++sam_buff->discarded_alignments;
389 	sam_buff->discarded_score_sum += BL_SAM_MAPQ(sam_alignment);
390 	if ( BL_SAM_MAPQ(sam_alignment) < sam_buff->min_discarded_score )
391 	    sam_buff->min_discarded_score = BL_SAM_MAPQ(sam_alignment);
392 	if ( BL_SAM_MAPQ(sam_alignment) > sam_buff->max_discarded_score )
393 	    sam_buff->max_discarded_score = BL_SAM_MAPQ(sam_alignment);
394 
395 #ifdef DEBUG
396 	fprintf(stderr, "bl_sam_buff_alignment_ok(): Discarding low quality alignment: %s,%zu MAPQ=%u\n",
397 		BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
398 		BL_SAM_MAPQ(sam_alignment));
399 #endif
400 	return false;
401     }
402     else
403 	return true;
404 }
405