1 #include <sysexits.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <sys/param.h> // MIN()
5 #include <xtend/string.h> // strlcpy() on Linux
6 #include <xtend/mem.h>
7 #include <inttypes.h>
8 #include "sam-buff.h"
9 #include "biostring.h"
10
11 /***************************************************************************
12 * Library:
13 * #include <biolibc/sam-buff.h>
14 * -lbiolibc -lxtend
15 *
16 * Description:
17 * Check that the newly read SAM alignment comes after the previous
18 * one, assuming the input is sorted first by chrom and then
19 * position. The previous chrom and position are stored in
20 * sam_buff (and initialized so that the first SAM alignment read is
21 * always OK).
22 *
23 * Arguments:
24 * sam_buff Pointer to a SAM buffer with recent alignments
25 * sam_alignment Pointer to the most recently read alignment
26 *
27 * See also:
28 * bl_sam_read(3)
29 *
30 * History:
31 * Date Name Modification
32 * 2020-05-27 Jason Bacon Begin
33 ***************************************************************************/
34
bl_sam_buff_check_order(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)35 void bl_sam_buff_check_order(bl_sam_buff_t *sam_buff,
36 bl_sam_t *sam_alignment)
37
38 {
39 /*fprintf(stderr, "Previous SAM: %s %zu, Current SAM: %s %zu\n",
40 sam_buff->previous_rname, sam_buff->previous_pos,
41 sam_alignment->rname, sam_alignment->pos);*/
42 if ( strcmp(sam_alignment->rname, sam_buff->previous_rname) == 0 )
43 {
44 // Silly to assign when already ==, but sillier to add another check
45 if (sam_alignment->pos < sam_buff->previous_pos )
46 bl_sam_buff_out_of_order(sam_buff, sam_alignment);
47 else
48 sam_buff->previous_pos = sam_alignment->pos;
49 }
50 else if ( bl_chrom_name_cmp(sam_alignment->rname,
51 sam_buff->previous_rname) < 0 )
52 bl_sam_buff_out_of_order(sam_buff, sam_alignment);
53 else
54 {
55 strlcpy(sam_buff->previous_rname, sam_alignment->rname, BL_SAM_RNAME_MAX_CHARS);
56 sam_buff->previous_pos = sam_alignment->pos;
57 }
58 }
59
60
61 /***************************************************************************
62 * Library:
63 * #include <biolibc/sam-buff.h>
64 * -lbiolibc -lxtend
65 *
66 * Description:
67 * Initialize a SAM alignment buffer for holding recently read SAM
68 * alignments. This is useful, for example, when scanning a SAM
69 * stream for alignments overlapping a certain region or position.
70 * The buffer array is set to a
71 * reasonable initial size and extended as far as BL_SAM_BUFF_MAX_SIZE
72 * by bl_sam_buff_add_alignment(3) if needed. A minimum MAPQ value
73 * is stored in the bl_sam_buff_t structure for filtering with
74 * bl_sam_buff_alignment_ok(3).
75 *
76 * Arguments:
77 * sam_buff Pointer to a the bl_sam_buff_t structure to initialize
78 * mapq_min User-selected minimum MAPQ value
79 *
80 * See also:
81 * bl_sam_buff_check_order(3), bl_sam_read(3)
82 *
83 * History:
84 * Date Name Modification
85 * 2020-05-27 Jason Bacon Begin
86 ***************************************************************************/
87
bl_sam_buff_init(bl_sam_buff_t * sam_buff,unsigned int mapq_min,size_t max_alignments)88 void bl_sam_buff_init(bl_sam_buff_t *sam_buff, unsigned int mapq_min,
89 size_t max_alignments)
90
91 {
92 size_t c;
93
94 sam_buff->buff_size = BL_SAM_BUFF_START_SIZE;
95 sam_buff->max_alignments = max_alignments;
96 sam_buff->buffered_count = 0;
97 sam_buff->max_count = 0;
98 sam_buff->previous_pos = 0;
99 *sam_buff->previous_rname = '\0';
100
101 sam_buff->mapq_min = mapq_min;
102 sam_buff->mapq_low = UINT64_MAX;
103 sam_buff->mapq_high = 0;
104 sam_buff->mapq_sum = 0;
105 sam_buff->reads_used = 0;
106
107 sam_buff->total_alignments = 0;
108 sam_buff->trailing_alignments = 0;
109 sam_buff->discarded_alignments = 0;
110 sam_buff->discarded_score_sum = 0;
111 sam_buff->min_discarded_score = SIZE_MAX;
112 sam_buff->max_discarded_score = 0;
113 sam_buff->discarded_trailing = 0;
114 sam_buff->unmapped_alignments = 0;
115
116 /*
117 * Dynamically allocating the pointers is probably senseless since they
118 * take very little space compared to the alignment data. By the time
119 * the pointer array takes a significant amount of RAM, you're probably
120 * already thrashing to accomodate the sequence data. The pointer array
121 * size is capped by BL_SAM_BUFF_MAX_SIZE to prevent memory exhaustion.
122 * We may save a few megabytes with this, though.
123 */
124 sam_buff->alignments =
125 (bl_sam_t **)xt_malloc(sam_buff->buff_size,
126 sizeof(bl_sam_t **));
127 for (c = 0; c < sam_buff->buff_size; ++c)
128 sam_buff->alignments[c] = NULL;
129 }
130
131
132 /***************************************************************************
133 * Library:
134 * #include <biolibc/sam-buff.h>
135 * -lbiolibc -lxtend
136 *
137 * Description:
138 * Add a new alignment to the buffer, expanding the array as needed
139 * up to BL_SAM_BUFF_MAX_SIZE.
140 *
141 * Arguments:
142 * sam_buff Pointer to bl_sam_buff_t structure where alignments are buffered
143 * sam_alignment New SAM alignment to add to buffer
144 *
145 * See also:
146 * bl_sam_buff_init(3), bl_sam_buff_check_order(3)
147 *
148 * History:
149 * Date Name Modification
150 * 2020-05-27 Jason Bacon Begin
151 ***************************************************************************/
152
bl_sam_buff_add_alignment(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)153 int bl_sam_buff_add_alignment(bl_sam_buff_t *sam_buff,
154 bl_sam_t *sam_alignment)
155
156 {
157 size_t old_buff_size,
158 c;
159
160 bl_sam_buff_check_order(sam_buff, sam_alignment);
161
162 sam_buff->mapq_low = MIN(sam_buff->mapq_low, BL_SAM_MAPQ(sam_alignment));
163 sam_buff->mapq_high = MAX(sam_buff->mapq_high, BL_SAM_MAPQ(sam_alignment));
164 sam_buff->mapq_sum += BL_SAM_MAPQ(sam_alignment);
165 ++sam_buff->reads_used;
166
167 // Just allocate the static fields, bl_sam_copy() does the rest
168 if ( sam_buff->alignments[sam_buff->buffered_count] == NULL )
169 {
170 //fprintf(stderr, "Allocating alignment #%zu\n", sam_buff->buffered_count);
171 sam_buff->alignments[sam_buff->buffered_count] =
172 xt_malloc(1, sizeof(bl_sam_t));
173 if ( sam_buff->alignments[sam_buff->buffered_count] == NULL )
174 {
175 fprintf(stderr, "bl_sam_buff_add_alignment(): Could not allocate alignments.\n");
176 exit(EX_UNAVAILABLE);
177 }
178 // Redundant to bl_sam_copy()
179 // bl_sam_init(sam_buff->alignments[sam_buff->buffered_count], 0);
180 }
181 else
182 bl_sam_free(sam_buff->alignments[sam_buff->buffered_count]);
183
184 bl_sam_copy(sam_buff->alignments[sam_buff->buffered_count], sam_alignment);
185
186 ++sam_buff->buffered_count;
187
188 if ( sam_buff->buffered_count > sam_buff->max_count )
189 {
190 sam_buff->max_count = sam_buff->buffered_count;
191 // fprintf(stderr, "sam_buff->max_count = %zu\n", sam_buff->max_count);
192 }
193
194 if ( sam_buff->buffered_count == sam_buff->max_alignments )
195 {
196 fprintf(stderr,
197 "bl_sam_buff_add_alignment(): Hit maximum alignments=%zu.\n",
198 sam_buff->max_alignments);
199 fprintf(stderr, "Aborting add to prevent runaway memory use.\n");
200 fprintf(stderr, "Check your SAM input.\n");
201 return BL_SAM_BUFF_ADD_FAILED;
202 }
203
204 if ( sam_buff->buffered_count == sam_buff->buff_size )
205 {
206 fprintf(stderr,
207 "bl_sam_buff_add_alignment(): Hit buff_size=%zu, doubling buffer size.\n",
208 sam_buff->buff_size);
209 fprintf(stderr, "RNAME: %s POS: %" PRIu64 " LEN: %zu\n",
210 BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
211 BL_SAM_SEQ_LEN(sam_alignment));
212 old_buff_size = sam_buff->buff_size;
213 sam_buff->buff_size *= 2;
214 sam_buff->alignments =
215 (bl_sam_t **)xt_realloc(sam_buff->alignments,
216 sam_buff->buff_size,
217 sizeof(bl_sam_t **));
218 for (c = old_buff_size; c < sam_buff->buff_size; ++c)
219 sam_buff->alignments[c] = NULL;
220 }
221 return BL_SAM_BUFF_OK;
222 }
223
224
225 /***************************************************************************
226 * Library:
227 * #include <biolibc/sam-buff.h>
228 * -lbiolibc -lxtend
229 *
230 * Description:
231 * Report SAM input sort error and terminate process.
232 *
233 * Arguments:
234 * sam_buff Pointer to bl_sam_buff_t structure
235 * sam_alignment Offending alignment out of order with previous
236 *
237 * Returns:
238 * Does not return.
239 *
240 * See also:
241 * bl_sam_buff_alignment_ok(3)
242 *
243 * History:
244 * Date Name Modification
245 * 2020-05-27 Jason Bacon Begin
246 ***************************************************************************/
247
bl_sam_buff_out_of_order(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)248 void bl_sam_buff_out_of_order(bl_sam_buff_t *sam_buff, bl_sam_t *sam_alignment)
249
250 {
251 fprintf(stderr, "Error: SAM input must be sorted by chrom and then position.\n");
252 fprintf(stderr, "Found %s,%" PRIu64 " after %s,%" PRIu64 ".\n",
253 BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
254 sam_buff->previous_rname, sam_buff->previous_pos);
255 exit(EX_DATAERR);
256 }
257
258
259 /***************************************************************************
260 * Library:
261 * #include <biolibc/sam-buff.h>
262 * -lbiolibc -lxtend
263 *
264 * Description:
265 * Free an element of the SAM alignment array by first freeing all
266 * memory allocated by the bl_sam_t structure and then freeing
267 * memory allocated for the structure itself.
268 *
269 * Arguments:
270 * sam_buff Pointer to the bl_sam_buff_t structure holding alignments
271 * c Index of the alignment to be freed (0-based)
272 *
273 * See also:
274 * bl_sam_buff_init(3), bl_sam_buff_add_alignment(3)
275 *
276 * History:
277 * Date Name Modification
278 * 2020-05-29 Jason Bacon Begin
279 ***************************************************************************/
280
bl_sam_buff_free_alignment(bl_sam_buff_t * sam_buff,size_t c)281 void bl_sam_buff_free_alignment(bl_sam_buff_t *sam_buff, size_t c)
282
283 {
284 bl_sam_free(sam_buff->alignments[c]);
285 bl_sam_init(sam_buff->alignments[c], 0, BL_SAM_FIELD_ALL);
286 if ( sam_buff->alignments[c] != NULL )
287 {
288 free(sam_buff->alignments[c]);
289 sam_buff->alignments[c] = NULL;
290 }
291 }
292
293
294 /***************************************************************************
295 * Library:
296 * #include <biolibc/sam-buff.h>
297 * -lbiolibc -lxtend
298 *
299 * Description:
300 * Free nelem SAM alignments at the head of the queue and shift
301 * remaining elements forward nelem positions.
302 *
303 * Arguments:
304 * sam_buff Pointer to bl_sam_buff_t structure holding alignments
305 * nelem Number of alignments to free
306 *
307 * See also:
308 * bl_sam_buff_free_alignment(3)
309 *
310 * FIXME: Use circular queuing for better efficiency
311 *
312 * History:
313 * Date Name Modification
314 * 2020-05-29 Jason Bacon Begin
315 ***************************************************************************/
316
bl_sam_buff_shift(bl_sam_buff_t * sam_buff,size_t nelem)317 void bl_sam_buff_shift(bl_sam_buff_t *sam_buff, size_t nelem)
318
319 {
320 size_t c;
321
322 /*
323 * FIXME: A circular queue would be more efficient, but won't matter
324 * much to the bottom line to avoid shifting a few pointers on top
325 * of everything else going on here
326 */
327
328 /* Make susre elements to be removed are freed */
329 for (c = 0; c < nelem; ++c)
330 bl_sam_buff_free_alignment(sam_buff, c);
331
332 /* Shift elements */
333 for (c = 0; c < sam_buff->buffered_count - nelem; ++c)
334 sam_buff->alignments[c] = sam_buff->alignments[c + nelem];
335
336 /* Clear vacated elements */
337 while ( c < sam_buff->buffered_count )
338 sam_buff->alignments[c++] = NULL;
339
340 sam_buff->buffered_count -= nelem;
341 }
342
343
344 /***************************************************************************
345 * Library:
346 * #include <biolibc/sam-buff.h>
347 * -lbiolibc -lxtend
348 *
349 * Description:
350 * Verify that an alignment meets the quality requirements for the
351 * given SAM buffer. Each bl_sam_buff_t structure contains
352 * specifications for minimum quality scores as well as statistics
353 * on alignments checked, including the number of discarded alignments
354 * and a sum of the alignment scores (for calculating the average
355 * score of discarded alignments).
356 *
357 * Arguments:
358 * sam_buff Pointer to bl_sam_buff_t structure with quality specs
359 * sam_alignment Pointer to new alignment
360 *
361 * Returns:
362 * true if the alignment meets the quality requirements
363 * false otherwise
364 *
365 * See also:
366 * bl_sam_buff_check_order(3)
367 *
368 * History:
369 * Date Name Modification
370 * 2020-05-29 Jason Bacon Begin
371 ***************************************************************************/
372
bl_sam_buff_alignment_ok(bl_sam_buff_t * sam_buff,bl_sam_t * sam_alignment)373 bool bl_sam_buff_alignment_ok(bl_sam_buff_t *sam_buff,
374 bl_sam_t *sam_alignment)
375
376 {
377 if ( sam_alignment->flag & BAM_FUNMAP )
378 {
379 ++sam_buff->unmapped_alignments;
380 #ifdef DEBUG
381 fprintf(stderr, "Discarding unmapped read: %s,%zu\n",
382 BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment));
383 #endif
384 return false;
385 }
386 else if ( BL_SAM_MAPQ(sam_alignment) < BL_SAM_BUFF_MAPQ_MIN(sam_buff) )
387 {
388 ++sam_buff->discarded_alignments;
389 sam_buff->discarded_score_sum += BL_SAM_MAPQ(sam_alignment);
390 if ( BL_SAM_MAPQ(sam_alignment) < sam_buff->min_discarded_score )
391 sam_buff->min_discarded_score = BL_SAM_MAPQ(sam_alignment);
392 if ( BL_SAM_MAPQ(sam_alignment) > sam_buff->max_discarded_score )
393 sam_buff->max_discarded_score = BL_SAM_MAPQ(sam_alignment);
394
395 #ifdef DEBUG
396 fprintf(stderr, "bl_sam_buff_alignment_ok(): Discarding low quality alignment: %s,%zu MAPQ=%u\n",
397 BL_SAM_RNAME(sam_alignment), BL_SAM_POS(sam_alignment),
398 BL_SAM_MAPQ(sam_alignment));
399 #endif
400 return false;
401 }
402 else
403 return true;
404 }
405