1 #include "samtools.pysam.h"
2
3 /* stats.c -- This is the former bamcheck integrated into samtools/htslib.
4
5 Copyright (C) 2020-2021 Genome Research Ltd.
6
7 Author: James Bonfield <jkb@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 /*
28 * This tool is designed to give "samtools stats" style output, but dedicated
29 * to small amplicon sequencing projects. It gathers stats on the
30 * distribution of reads across amplicons.
31 */
32
33 /*
34 * TODO:
35 * - Cope with multiple references. What do we do here? Just request one?
36 * - Permit regions rather than consuming whole file (maybe solves above).
37 */
38
39 #include <config.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <inttypes.h>
44 #include <getopt.h>
45 #include <unistd.h>
46 #include <math.h>
47
48 #include <htslib/sam.h>
49 #include <htslib/khash.h>
50
51 #include "samtools.h"
52 #include "sam_opts.h"
53 #include "bam_ampliconclip.h"
54
55 KHASH_MAP_INIT_INT64(tcoord, int64_t)
56 KHASH_MAP_INIT_STR(qname, int64_t)
57
58 #ifndef MIN
59 #define MIN(a,b) ((a)<(b)?(a):(b))
60 #endif
61
62 #ifndef MAX
63 #define MAX(a,b) ((a)>(b)?(a):(b))
64 #endif
65
66 #ifndef ABS
67 #define ABS(a) ((a)>=0?(a):-(a))
68 #endif
69
70 #define TCOORD_MIN_COUNT 10
71 #define MAX_AMP 1000 // Default maximum number of amplicons
72 #define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon
73 #define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT
74 #define MAX_DEPTH 5 // Number of different depths permitted
75
76 typedef struct {
77 sam_global_args ga;
78 uint32_t flag_require;
79 uint32_t flag_filter;
80 int max_delta; // Used for matching read to amplicon primer loc
81 int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
82 int use_sample_name;
83 int max_amp; // Total number of amplicons
84 int max_amp_len; // Maximum length of an individual amplicon
85 double depth_bin;// aggregate depth within this fraction
86 int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate
87 FILE *out_fp;
88 char *argv;
89 int tcoord_min_count;
90 int tcoord_bin;
91 int multi_ref;
92 } astats_args_t;
93
94 typedef struct {
95 int nseq; // total sequence count
96 int nfiltered; // sequence filtered
97 int nfailprimer;// count of sequences not matching the primer locations
98
99 // Sizes of memory allocated below, to permit reset
100 int max_amp, max_amp_len, max_len;
101
102 // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
103 int64_t *nreads, *nreads2; // [max_amp]
104 double *nfull_reads; // [max_amp]; 0.5/read if paired.
105 double *nrperc, *nrperc2; // [max_amp]
106 int64_t *nbases, *nbases2; // [max_amp]
107 int64_t *coverage; // [max_amp][max_amp_len]
108 double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
109 double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
110 khash_t(tcoord) **tcoord; // [max_amp+1]
111
112 // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
113 int (*amp_dist)[3]; // [MAX_AMP][3];
114
115 int *depth_valid; // [max_len]
116 int *depth_all; // [max_len]
117 khash_t(qname) *qend; // queryname end, for overlap removal
118 } astats_t;
119
120 // We can have multiple primers for LEFT / RIGHT, so this
121 // permits detection by any compatible combination.
122 // One reference:
123 typedef struct {
124 int64_t left[MAX_PRIMER_PER_AMPLICON];
125 int nleft;
126 int64_t right[MAX_PRIMER_PER_AMPLICON];
127 int nright;
128 int64_t max_left, min_right; // inner dimensions
129 int64_t min_left, max_right; // outer dimensions
130 } amplicon_t;
131
132 // Multiple references, we have an array of amplicons_t - one per used ref.
133 // We have per reference local and global stats here, as some of the stats
134 // are coordinate based. However we report them combined together as a single
135 // list across all references.
136 // "namp" is the number of amplicons in this reference, but they're
137 // numbered first_amp to first_amp+namp-1 inclusively.
138 typedef struct {
139 int tid, namp;
140 int64_t len;
141 bed_entry_list_t *sites;
142 amplicon_t *amp;
143 astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
144 const char *ref; // ref name (pointer to the bed hash table key)
145 int first_amp; // first amplicon number for this ref
146 } amplicons_t;
147
148 // Reinitialised for each new reference/chromosome.
149 // Counts from 1 to namp, -1 for no match and 0 for ?.
150 static int *pos2start = NULL;
151 static int *pos2end = NULL;
152 static int pos2size = 0; // allocated size of pos2start/end
153
154 // Lookup table to go from position to amplicon based on
155 // read start / end.
initialise_amp_pos_lookup(astats_args_t * args,amplicons_t * amps,int ref)156 static int initialise_amp_pos_lookup(astats_args_t *args,
157 amplicons_t *amps,
158 int ref) {
159 int64_t i, j;
160 amplicon_t *amp = amps[ref].amp;
161 int64_t max_len = amps[ref].len;
162 int namp = amps[ref].namp;
163
164 if (max_len+1 > pos2size) {
165 if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
166 return -1;
167 if (!(pos2end = realloc(pos2end, (max_len+1)*sizeof(*pos2end))))
168 return -1;
169 pos2size = max_len;
170 }
171 for (i = 0; i < max_len; i++)
172 pos2start[i] = pos2end[i] = -1;
173
174 for (i = 0; i < namp; i++) {
175 for (j = 0; j < amp[i].nleft; j++) {
176 int64_t p;
177 for (p = amp[i].left[j] - args->max_delta;
178 p <= amp[i].left[j] + args->max_delta; p++) {
179 if (p < 1 || p > max_len)
180 continue;
181 pos2start[p-1] = i;
182 }
183 }
184 for (j = 0; j < amp[i].nright; j++) {
185 int64_t p;
186 for (p = amp[i].right[j] - args->max_delta;
187 p <= amp[i].right[j] + args->max_delta; p++) {
188 if (p < 1 || p > max_len)
189 continue;
190 pos2end[p-1] = i;
191 }
192 }
193 }
194
195 return 0;
196 }
197
198 // Counts amplicons.
199 // Assumption: input BED file alternates between LEFT and RIGHT primers
200 // per amplicon, thus we can count the number based on the switching
201 // orientation.
count_amplicon(bed_entry_list_t * sites)202 static int count_amplicon(bed_entry_list_t *sites) {
203 int i, namp, last_rev = 0;
204 for (i = namp = 0; i < sites->length; i++) {
205 if (sites->bp[i].rev == 0 && last_rev)
206 namp++;
207 last_rev = sites->bp[i].rev;
208 }
209
210 return ++namp;
211 }
212
213 // We're only interest in the internal part of the amplicon.
214 // Our bed file has LEFT start/end followed by RIGHT start/end,
215 // so collapse these to LEFT end / RIGHT start.
216 //
217 // Returns right most amplicon position on success,
218 // < 0 on error
bed2amplicon(astats_args_t * args,bed_entry_list_t * sites,amplicon_t * amp,int * namp,int do_title,const char * ref,int first_amp)219 static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
220 amplicon_t *amp, int *namp, int do_title,
221 const char *ref, int first_amp) {
222 int i, j;
223 int64_t max_right = 0;
224 FILE *ofp = args->out_fp;
225
226 *namp = 0;
227
228 // Assume all primers for the same amplicon are adjacent in BED
229 // with all + followed by all -. Thus - to + signifies next primer set.
230 int last_rev = 0;
231 amp[0].max_left = 0;
232 amp[0].min_right = INT64_MAX;
233 amp[0].min_left = INT64_MAX;
234 amp[0].max_right = 0;
235 if (do_title) {
236 fprintf(ofp, "# Amplicon locations from BED file.\n");
237 fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
238 "comma-separated for alt-primers.\n");
239 if (args->multi_ref)
240 fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
241 else
242 fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
243 }
244 for (i = j = 0; i < sites->length; i++) {
245 if (i == 0 && sites->bp[i].rev != 0) {
246 fprintf(samtools_stderr, "[ampliconstats] error: BED file should start"
247 " with the + strand primer\n");
248 return -1;
249 }
250 if (sites->bp[i].rev == 0 && last_rev) {
251 j++;
252 if (j >= args->max_amp) {
253 fprintf(samtools_stderr, "[ampliconstats] error: too many amplicons"
254 " (%d). Use -a option to raise this.\n", j);
255 return -1;
256 }
257 amp[j].max_left = 0;
258 amp[j].min_right = INT64_MAX;
259 amp[j].min_left = INT64_MAX;
260 amp[j].max_right = 0;
261 }
262 if (sites->bp[i].rev == 0) {
263 if (i == 0 || last_rev) {
264 if (j>0) fprintf(ofp, "\n");
265 if (args->multi_ref)
266 fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
267 else
268 fprintf(ofp, "AMPLICON\t%d", j+1);
269 }
270 if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
271 print_error_errno("ampliconstats",
272 "too many primers per amplicon (%d).\n",
273 MAX_PRIMER_PER_AMPLICON);
274 return -1;
275 }
276 amp[j].left[amp[j].nleft++] = sites->bp[i].right;
277 if (amp[j].max_left < sites->bp[i].right+1)
278 amp[j].max_left = sites->bp[i].right+1;
279 if (amp[j].min_left > sites->bp[i].right+1)
280 amp[j].min_left = sites->bp[i].right+1;
281 // BED file, so left+1 as zero based. right(+1-1) as
282 // BED goes one beyond end (and we want inclusive range).
283 fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
284 sites->bp[i].left+1, sites->bp[i].right);
285 } else {
286 if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
287 print_error_errno("ampliconstats",
288 "too many primers per amplicon (%d)",
289 MAX_PRIMER_PER_AMPLICON);
290 return -1;
291 }
292 amp[j].right[amp[j].nright++] = sites->bp[i].left;
293 if (amp[j].min_right > sites->bp[i].left-1)
294 amp[j].min_right = sites->bp[i].left-1;
295 if (amp[j].max_right < sites->bp[i].left-1) {
296 amp[j].max_right = sites->bp[i].left-1;
297 if (amp[j].max_right - amp[j].min_left + 1 >=
298 args->max_amp_len) {
299 fprintf(samtools_stderr, "[ampliconstats] error: amplicon "
300 "longer (%d) than max_amp_len option (%d)\n",
301 (int)(amp[j].max_right - amp[j].min_left + 2),
302 args->max_amp_len);
303 return -1;
304 }
305 if (max_right < amp[j].max_right)
306 max_right = amp[j].max_right;
307 }
308 fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
309 sites->bp[i].left+1, sites->bp[i].right);
310 }
311 last_rev = sites->bp[i].rev;
312 }
313 if (last_rev != 1) {
314 fprintf(ofp, "\n"); // useful if going to samtools_stdout
315 fprintf(samtools_stderr, "[ampliconstats] error: bed file does not end on"
316 " a reverse strand primer.\n");
317 return -1;
318 }
319 *namp = ++j;
320 if (j) fprintf(ofp, "\n");
321
322 if (j >= args->max_amp) {
323 fprintf(samtools_stderr, "[ampliconstats] error: "
324 "too many amplicons (%d). Use -a option to raise this.", j);
325 return -1;
326 }
327
328 // for (i = 0; i < *namp; i++) {
329 // fprintf(samtools_stdout, "%d\t%ld", i, amp[i].length);
330 // for (j = 0; j < amp[i].nleft; j++)
331 // fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].left[j]);
332 // for (j = 0; j < amp[i].nright; j++)
333 // fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].right[j]);
334 // fprintf(samtools_stdout, "\n");
335 // }
336
337 return max_right;
338 }
339
stats_free(astats_t * st)340 void stats_free(astats_t *st) {
341 if (!st)
342 return;
343
344 free(st->nreads);
345 free(st->nreads2);
346 free(st->nfull_reads);
347 free(st->nrperc);
348 free(st->nrperc2);
349 free(st->nbases);
350 free(st->nbases2);
351 free(st->coverage);
352 free(st->covered_perc);
353 free(st->covered_perc2);
354 free(st->amp_dist);
355
356 free(st->depth_valid);
357 free(st->depth_all);
358
359 if (st->tcoord) {
360 int i;
361 for (i = 0; i <= st->max_amp; i++) {
362 if (st->tcoord[i])
363 kh_destroy(tcoord, st->tcoord[i]);
364 }
365 free(st->tcoord);
366 }
367
368 khiter_t k;
369 for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
370 if (kh_exist(st->qend, k))
371 free((void *)kh_key(st->qend, k));
372 kh_destroy(qname, st->qend);
373
374 free(st);
375 }
376
stats_alloc(int64_t max_len,int max_amp,int max_amp_len)377 astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
378 astats_t *st = calloc(1, sizeof(*st));
379 if (!st)
380 return NULL;
381
382 st->max_amp = max_amp;
383 st->max_amp_len = max_amp_len;
384 st->max_len = max_len;
385
386 if (!(st->nreads = calloc(max_amp, sizeof(*st->nreads)))) goto err;
387 if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
388 if (!(st->nrperc = calloc(max_amp, sizeof(*st->nrperc)))) goto err;
389 if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
390 if (!(st->nbases = calloc(max_amp, sizeof(*st->nbases)))) goto err;
391 if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
392
393 if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
394 goto err;
395
396 if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage))))
397 goto err;
398
399 if (!(st->covered_perc = calloc(max_amp, sizeof(*st->covered_perc))))
400 goto err;
401 if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
402 goto err;
403
404 if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
405 int i;
406 for (i = 0; i <= st->max_amp; i++)
407 if (!(st->tcoord[i] = kh_init(tcoord)))
408 goto err;
409
410 if (!(st->qend = kh_init(qname)))
411 goto err;
412
413 if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
414 goto err;
415 if (!(st->depth_all = calloc(max_len, sizeof(*st->depth_all))))
416 goto err;
417
418 if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist)))) goto err;
419
420 return st;
421
422 err:
423 stats_free(st);
424 return NULL;
425 }
426
stats_reset(astats_t * st)427 static void stats_reset(astats_t *st) {
428 st->nseq = 0;
429 st->nfiltered = 0;
430 st->nfailprimer = 0;
431
432 memset(st->nreads, 0, st->max_amp * sizeof(*st->nreads));
433 memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
434 memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
435
436 memset(st->nrperc, 0, st->max_amp * sizeof(*st->nrperc));
437 memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
438
439 memset(st->nbases, 0, st->max_amp * sizeof(*st->nbases));
440 memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
441
442 memset(st->coverage, 0, st->max_amp * st->max_amp_len
443 * sizeof(*st->coverage));
444 memset(st->covered_perc, 0, st->max_amp * sizeof(*st->covered_perc));
445 memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
446
447 // Keep the allocated entries as it's likely all files will share
448 // the same keys. Instead we reset counters to zero for common ones
449 // and delete rare ones.
450 int i;
451 for (i = 0; i <= st->max_amp; i++) {
452 khiter_t k;
453 for (k = kh_begin(st->tcoord[i]);
454 k != kh_end(st->tcoord[i]); k++)
455 if (kh_exist(st->tcoord[i], k)) {
456 if (kh_value(st->tcoord[i], k) < 5)
457 kh_del(tcoord, st->tcoord[i], k);
458 else
459 kh_value(st->tcoord[i], k) = 0;
460 }
461 }
462
463 khiter_t k;
464 for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
465 if (kh_exist(st->qend, k))
466 free((void *)kh_key(st->qend, k));
467 kh_clear(qname, st->qend);
468
469 memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
470 memset(st->depth_all, 0, st->max_len * sizeof(*st->depth_all));
471 memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist));
472 }
473
amp_stats_reset(amplicons_t * amps,int nref)474 static void amp_stats_reset(amplicons_t *amps, int nref) {
475 int i;
476 for (i = 0; i < nref; i++) {
477 if (!amps[i].sites)
478 continue;
479 stats_reset(amps[i].lstats);
480 }
481 }
482
accumulate_stats(astats_args_t * args,amplicons_t * amps,bam1_t * b)483 static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
484 bam1_t *b) {
485 int ref = b->core.tid;
486 amplicon_t *amp = amps[ref].amp;
487 astats_t *stats = amps[ref].lstats;
488 int len = amps[ref].len;
489
490 if (!stats)
491 return 0;
492
493 stats->nseq++;
494 if ((b->core.flag & args->flag_require) != args->flag_require ||
495 (b->core.flag & args->flag_filter) != 0) {
496 stats->nfiltered++;
497 return 0;
498 }
499
500 int64_t start = b->core.pos, mstart = start; // modified start
501 int64_t end = bam_endpos(b), i;
502
503 // Compute all-template-depth and valid-template-depth.
504 // We track current end location per read name so we can remove overlaps.
505 // Potentially we could use this data for a better amplicon-depth
506 // count too, but for now it's purely for the per-base plots.
507 int ret;
508 khiter_t k;
509 int prev_start = 0, prev_end = 0;
510 if ((b->core.flag & BAM_FPAIRED)
511 && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
512 k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
513 if (ret == 0) {
514 prev_start = kh_value(stats->qend, k) & 0xffffffff;
515 prev_end = kh_value(stats->qend, k)>>32;
516 mstart = MAX(mstart, prev_end);
517 // Ideally we'd reuse strings so we don't thrash free/malloc.
518 // However let's see if the official way of doing that (malloc
519 // itself) is fast enough first.
520 free((void *)kh_key(stats->qend, k));
521 kh_del(qname, stats->qend, k);
522 //fprintf(samtools_stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
523 } else {
524 if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
525 return -1;
526
527 kh_value(stats->qend, k) = start | (end << 32);
528 }
529 }
530 for (i = mstart; i < end && i < len; i++)
531 stats->depth_all[i]++;
532 if (i < end) {
533 print_error("ampliconstats", "record %s overhangs end of reference",
534 bam_get_qname(b));
535 // But keep going, as it's harmless.
536 }
537
538 // On single ended runs, eg ONT or PacBio, we just use the start/end
539 // of the template to assign.
540 int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
541 ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
542 : (start >= 0 && start < len ? pos2start[start] : -1);
543
544 // ivar sometimes soft-clips 100% of the bases.
545 // This is essentially unmapped
546 if (end == start && (args->flag_filter & BAM_FUNMAP)) {
547 stats->nfiltered++;
548 return 0;
549 }
550
551 if (anum == -1)
552 stats->nfailprimer++;
553
554 if (anum >= 0) {
555 int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
556 if (c > 0) {
557 stats->nreads[anum]++;
558 // NB: ref bases rather than read bases
559 stats->nbases[anum] += c;
560
561 int64_t i;
562 if (start < 0) start = 0;
563 if (end > len) end = len;
564
565 int64_t ostart = MAX(start, amp[anum].min_left-1);
566 int64_t oend = MIN(end, amp[anum].max_right);
567 int64_t offset = amp[anum].min_left-1;
568 for (i = ostart; i < oend; i++)
569 stats->coverage[anum*stats->max_amp_len + i-offset]++;
570 } else {
571 stats->nfailprimer++;
572 }
573 }
574
575 // Template length in terms of amplicon number to amplicon number.
576 // We expect left to right of same amplicon (len 0), but it may go
577 // to next amplicon (len 1) or prev (len -1), etc.
578 int64_t t_end;
579 int oth_anum = -1;
580
581 if (b->core.flag & BAM_FPAIRED) {
582 t_end = (b->core.flag & BAM_FREVERSE ? end : start)
583 + b->core.isize;
584
585 // If we've clipped the primers but not followed up with a fixmates
586 // then our start+TLEN will take us to a location which is
587 // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
588 //
589 // The correct solution is to run samtools fixmate so TLEN is correct.
590 // The hacky solution is to fudge the expected tlen by double the
591 // average primer length (e.g. 50).
592 t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
593
594 if (t_end > 0 && t_end < len && b->core.isize != 0)
595 oth_anum = (b->core.flag & BAM_FREVERSE)
596 ? pos2start[t_end]
597 : pos2end[t_end];
598 } else {
599 // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
600 oth_anum = pos2start[start];
601 t_end = end;
602 }
603
604 // We don't want to count our pairs twice.
605 // If both left/right are known, count it on left only.
606 // If only one is known, we'll only get to this code once
607 // so we can also count it.
608 int astatus = 2;
609 if (anum != -1 && oth_anum != -1) {
610 astatus = oth_anum == anum ? 0 : 1;
611 if (start <= t_end)
612 stats->amp_dist[anum][astatus]++;
613 } else if (anum >= 0) {
614 stats->amp_dist[anum][astatus = 2]++;
615 }
616
617 if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
618 if (prev_end && mstart > prev_end) {
619 // 2nd read with gap to 1st; undo previous increment.
620 for (i = prev_start; i < prev_end; i++)
621 stats->depth_valid[i]--;
622 stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
623 } else {
624 // 1st read, or 2nd read that overlaps 1st
625 for (i = mstart; i < end; i++)
626 stats->depth_valid[i]++;
627 stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
628 }
629 }
630
631 // Track template start,end frequencies, so we can give stats on
632 // amplicon primer usage.
633 if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
634 // left to right only, so we don't double count template positions.
635 return 0;
636
637 start = b->core.pos;
638 t_end = b->core.flag & BAM_FPAIRED
639 ? start + b->core.isize-1
640 : end;
641 uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
642 k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
643 if (ret < 0)
644 return -1;
645 if (ret == 0)
646 kh_value(stats->tcoord[anum+1], k)++;
647 else
648 kh_value(stats->tcoord[anum+1], k)=1;
649 kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
650
651 return 0;
652 }
653
654 // Append file local stats to global stats
append_lstats(astats_t * lstats,astats_t * gstats,int namp,int all_nseq)655 int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
656 gstats->nseq += lstats->nseq;
657 gstats->nfiltered += lstats->nfiltered;
658 gstats->nfailprimer += lstats->nfailprimer;
659
660 int a;
661 for (a = -1; a < namp; a++) {
662 // Add khash local (kl) to khash global (kg)
663 khiter_t kl, kg;
664 for (kl = kh_begin(lstats->tcoord[a+1]);
665 kl != kh_end(lstats->tcoord[a+1]); kl++) {
666 if (!kh_exist(lstats->tcoord[a+1], kl) ||
667 kh_value(lstats->tcoord[a+1], kl) == 0)
668 continue;
669
670 int ret;
671 kg = kh_put(tcoord, gstats->tcoord[a+1],
672 kh_key(lstats->tcoord[a+1], kl),
673 &ret);
674 if (ret < 0)
675 return -1;
676
677 kh_value(gstats->tcoord[a+1], kg) =
678 (ret == 0
679 ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF)
680 : 0)
681 + kh_value(lstats->tcoord[a+1], kl);
682 }
683 if (a == -1) continue;
684
685 gstats->nreads[a] += lstats->nreads[a];
686 gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a];
687 gstats->nfull_reads[a] += lstats->nfull_reads[a];
688
689 // To get mean & sd for amplicon read percentage, we need
690 // to do the divisions here as nseq differs for each sample.
691 double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
692 gstats->nrperc[a] += nrperc;
693 gstats->nrperc2[a] += nrperc*nrperc;
694
695 gstats->nbases[a] += lstats->nbases[a];
696 gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
697
698 int d;
699 for (d = 0; d < MAX_DEPTH; d++) {
700 gstats->covered_perc[a][d] += lstats->covered_perc[a][d];
701 gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
702 * lstats->covered_perc[a][d];
703 }
704
705 for (d = 0; d < 3; d++)
706 gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
707 }
708
709 for (a = 0; a < lstats->max_len; a++) {
710 gstats->depth_valid[a] += lstats->depth_valid[a];
711 gstats->depth_all[a] += lstats->depth_all[a];
712 }
713
714 return 0;
715 }
716
append_stats(amplicons_t * amps,int nref)717 int append_stats(amplicons_t *amps, int nref) {
718 int i, r, all_nseq = 0;
719 for (r = 0; r < nref; r++) {
720 if (!amps[r].sites)
721 continue;
722 astats_t *stats = amps[r].lstats;
723 all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
724 }
725
726 for (i = 0; i < nref; i++) {
727 if (!amps[i].sites)
728 continue;
729 if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp,
730 all_nseq) < 0)
731 return -1;
732 }
733
734 return 0;
735 }
736
737 typedef struct {
738 int32_t start, end;
739 uint32_t freq;
740 uint32_t status;
741 } tcoord_t;
742
743 // Sort tcoord by descending frequency and then ascending start and end.
tcoord_freq_sort(const void * vp1,const void * vp2)744 static int tcoord_freq_sort(const void *vp1, const void *vp2) {
745 const tcoord_t *t1 = (const tcoord_t *)vp1;
746 const tcoord_t *t2 = (const tcoord_t *)vp2;
747
748 if (t1->freq != t2->freq)
749 return t2->freq - t1->freq;
750
751 if (t1->start != t2->start)
752 return t1->start - t2->start;
753
754 return t1->end - t2->end;
755 }
756
757
758 /*
759 * Merges tcoord start,end,freq,status tuples if their coordinates are
760 * close together. We aim to keep the start,end for the most frequent
761 * value and assume that is the correct coordinate and all others are
762 * minor fluctuations due to errors or variants.
763 *
764 * We sort by frequency first and then merge later items in the list into
765 * the earlier more frequent ones. It's O(N^2), but sufficient for now
766 * given current scale of projects.
767 *
768 * If we ever need to resolve that then consider sorting by start
769 * coordinate and scanning the list to find all items within X, find
770 * the most frequent of those, and then cluster that way. (I'd have
771 * done that had I thought of it at the time!)
772 */
aggregate_tcoord(astats_args_t * args,tcoord_t * tpos,size_t * np)773 static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
774 size_t n = *np, j, j2, j3, k;
775
776 // Sort by frequency and cluster infrequent coords into frequent
777 // ones provided they're close by.
778 // This is O(N^2), but we've already binned by tcoord_bin/2 so
779 // the list isn't intended to be vast at this point.
780 qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
781
782 // For frequency ties, find mid start coord, and then find mid end
783 // coord of those matching start.
784 // We make that the first item so we merge into that mid point.
785 for (j = 0; j < n; j++) {
786 for (j2 = j+1; j2 < n; j2++) {
787 if (tpos[j].freq != tpos[j2].freq)
788 break;
789 if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
790 break;
791 }
792
793 // j to j2 all within bin of a common start,
794 // m is the mid start.
795 if (j2-1 > j) {
796 size_t m = (j2-1 + j)/2;
797
798 // Find mid end for this same start
799 while (m > 1 && tpos[m].start == tpos[m-1].start)
800 m--;
801 for (j3 = m+1; j3 < j2; j3++) {
802 if (tpos[m].start != tpos[j3].start)
803 break;
804 if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
805 break;
806 }
807 if (j3-1 > m)
808 m = (j3-1 + m)/2;
809
810 // Swap with first item.
811 tcoord_t tmp = tpos[j];
812 tpos[j] = tpos[m];
813 tpos[m] = tmp;
814 j = j2-1;
815 }
816 }
817
818 // Now merge in coordinates.
819 // This bit is O(N^2), so consider binning first to reduce the
820 // size of the list if we have excessive positional variation.
821 for (k = j = 0; j < n; j++) {
822 if (!tpos[j].freq)
823 continue;
824
825 if (k < j)
826 tpos[k] = tpos[j];
827
828 for (j2 = j+1; j2 < n; j2++) {
829 if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
830 ABS(tpos[j].end -tpos[j2].end) < args->tcoord_bin/2 &&
831 tpos[j].status == tpos[j2].status) {
832 tpos[k].freq += tpos[j2].freq;
833 tpos[j2].freq = 0;
834 }
835 }
836 k++;
837 }
838
839 *np = k;
840 }
841
dump_stats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref,int local)842 int dump_stats(astats_args_t *args, char type, char *name, int nfile,
843 amplicons_t *amps, int nref, int local) {
844 int i, r;
845 FILE *ofp = args->out_fp;
846 tcoord_t *tpos = NULL;
847 size_t ntcoord = 0;
848
849 // summary stats for this sample (or for all samples)
850 fprintf(ofp, "# Summary stats.\n");
851 fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
852
853 for (r = 0; r < nref; r++) {
854 if (!amps[r].sites)
855 continue;
856 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
857 int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
858 char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
859 if (!name_ref)
860 return -1;
861 if (args->multi_ref)
862 sprintf(name_ref, "%s\t%s", name, amps[r].ref);
863 else
864 sprintf(name_ref, "%s", name);
865 fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
866 type, name_ref, stats->nseq);
867 fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
868 type, name_ref, stats->nfiltered);
869 fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
870 type, name_ref, stats->nfailprimer);
871 fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
872 type, name_ref, nmatch);
873
874 int d = 0;
875 do {
876 // From first to last amplicon only, so not entire consensus.
877 // If contig length is known, maybe we want to add the missing
878 // count to < DEPTH figures?
879 int64_t start = 0, covered = 0, total = 0;
880 amplicon_t *amp = amps[r].amp;
881 for (i = 0; i < amps[r].namp; i++) {
882 int64_t j, offset = amp[i].min_left-1;
883 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
884 fprintf(samtools_stderr, "[ampliconstats] error: "
885 "Maximum amplicon length (%d) exceeded for '%s'\n",
886 stats->max_amp, name);
887 return -1;
888 }
889 for (j = MAX(start, amp[i].max_left-1);
890 j < MAX(start, amp[i].min_right); j++) {
891 if (stats->coverage[i*stats->max_amp_len + j-offset]
892 >= args->min_depth[d])
893 covered++;
894 total++;
895 }
896 start = MAX(start, amp[i].min_right);
897 }
898 fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
899 PRId64"\t%"PRId64"\n", type, name_ref,
900 args->min_depth[d], args->min_depth[d],
901 total-covered, covered);
902 } while (++d < MAX_DEPTH && args->min_depth[d]);
903
904 free(name_ref);
905 }
906
907 // Read count
908 fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
909 fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
910 fprintf(ofp, "%cREADS\t%s", type, name);
911 for (r = 0; r < nref; r++) {
912 if (!amps[r].sites)
913 continue;
914 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
915 for (i = 0; i < amps[r].namp; i++) {
916 fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
917 }
918 }
919 fprintf(ofp, "\n");
920
921 // Valid depth is the number of full length reads (already divided
922 // by the number we expect to cover), so +0.5 per read in pair.
923 // A.k.a "usable depth" in the plots.
924 fprintf(ofp, "%cVDEPTH\t%s", type, name);
925 for (r = 0; r < nref; r++) {
926 if (!amps[r].sites)
927 continue;
928 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
929 for (i = 0; i < amps[r].namp; i++)
930 fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
931 }
932 fprintf(ofp, "\n");
933
934 if (type == 'C') {
935 // For combined we can compute mean & standard deviation too
936 fprintf(ofp, "CREADS\tMEAN");
937 for (r = 0; r < nref; r++) {
938 if (!amps[r].sites)
939 continue;
940 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
941 for (i = 0; i < amps[r].namp; i++) {
942 fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
943 }
944 }
945 fprintf(ofp, "\n");
946
947 fprintf(ofp, "CREADS\tSTDDEV");
948 for (r = 0; r < nref; r++) {
949 if (!amps[r].sites)
950 continue;
951 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
952 for (i = 0; i < amps[r].namp; i++) {
953 double n1 = stats->nreads[i];
954 fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
955 ? sqrt(stats->nreads2[i]/(double)nfile
956 - (n1/nfile)*(n1/nfile))
957 : 0);
958 }
959 }
960 fprintf(ofp, "\n");
961 }
962
963 fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
964 fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
965 fprintf(ofp, "%cRPERC\t%s", type, name);
966 int all_nseq = 0;
967 for (r = 0; r < nref; r++) {
968 if (!amps[r].sites)
969 continue;
970 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
971 all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
972 }
973 for (r = 0; r < nref; r++) {
974 if (!amps[r].sites)
975 continue;
976 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
977 for (i = 0; i < amps[r].namp; i++) {
978 if (type == 'C') {
979 fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
980 } else {
981 fprintf(ofp, "\t%.3f",
982 all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
983 }
984 }
985 }
986 fprintf(ofp, "\n");
987
988 if (type == 'C') {
989 // For combined we compute mean and standard deviation too
990 fprintf(ofp, "CRPERC\tMEAN");
991 for (r = 0; r < nref; r++) {
992 if (!amps[r].sites)
993 continue;
994 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
995 for (i = 0; i < amps[r].namp; i++) {
996 fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
997 }
998 }
999 fprintf(ofp, "\n");
1000
1001 fprintf(ofp, "CRPERC\tSTDDEV");
1002 for (r = 0; r < nref; r++) {
1003 if (!amps[r].sites)
1004 continue;
1005 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1006 for (i = 0; i < amps[r].namp; i++) {
1007 // variance = SUM(X^2) - ((SUM(X)^2) / N)
1008 double n1 = stats->nrperc[i];
1009 double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
1010 fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
1011 }
1012 }
1013 fprintf(ofp, "\n");
1014 }
1015
1016 // Base depth
1017 fprintf(ofp, "# Read depth per amplicon.\n");
1018 fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
1019 fprintf(ofp, "%cDEPTH\t%s", type, name);
1020 for (r = 0; r < nref; r++) {
1021 if (!amps[r].sites)
1022 continue;
1023 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1024 amplicon_t *amp = amps[r].amp;
1025 for (i = 0; i < amps[r].namp; i++) {
1026 int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1027 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1028 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
1029 }
1030 }
1031 fprintf(ofp, "\n");
1032
1033 if (type == 'C') {
1034 // For combined we can compute mean & standard deviation too
1035 fprintf(ofp, "CDEPTH\tMEAN");
1036 for (r = 0; r < nref; r++) {
1037 if (!amps[r].sites)
1038 continue;
1039 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1040 amplicon_t *amp = amps[r].amp;
1041 int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1042 for (i = 0; i < amps[r].namp; i++) {
1043 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1044 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
1045 }
1046 }
1047 fprintf(ofp, "\n");
1048
1049 fprintf(ofp, "CDEPTH\tSTDDEV");
1050 for (r = 0; r < nref; r++) {
1051 if (!amps[r].sites)
1052 continue;
1053 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1054 amplicon_t *amp = amps[r].amp;
1055 for (i = 0; i < amps[r].namp; i++) {
1056 double alen = amp[i].min_right - amp[i].max_left+1;
1057 double n1 = stats->nbases[i] / alen;
1058 double v = stats->nbases2[i] / (alen*alen) /nfile
1059 - (n1/nfile)*(n1/nfile);
1060 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1061 }
1062 }
1063 fprintf(ofp, "\n");
1064 }
1065
1066 // Percent Coverage
1067 if (type == 'F') {
1068 fprintf(ofp, "# Percentage coverage per amplicon\n");
1069 fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
1070 int d = 0;
1071 do {
1072 fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
1073
1074 for (r = 0; r < nref; r++) {
1075 if (!amps[r].sites)
1076 continue;
1077 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1078 amplicon_t *amp = amps[r].amp;
1079 for (i = 0; i < amps[r].namp; i++) {
1080 int covered = 0;
1081 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
1082 fprintf(samtools_stderr, "[ampliconstats] error: "
1083 "Maximum amplicon length (%d) exceeded for '%s'\n",
1084 stats->max_amp, name);
1085 return -1;
1086 }
1087 int64_t j, offset = amp[i].min_left-1;
1088 for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
1089 int apos = i*stats->max_amp_len + j-offset;
1090 if (stats->coverage[apos] >= args->min_depth[d])
1091 covered++;
1092 }
1093 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1094 stats->covered_perc[i][d] = 100.0 * covered / alen;
1095 fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
1096 }
1097 }
1098 fprintf(ofp, "\n");
1099 } while (++d < MAX_DEPTH && args->min_depth[d]);
1100
1101 } else if (type == 'C') {
1102 // For combined we can compute mean & standard deviation too
1103 int d = 0;
1104 do {
1105 fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
1106 for (r = 0; r < nref; r++) {
1107 if (!amps[r].sites)
1108 continue;
1109 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1110 for (i = 0; i < amps[r].namp; i++) {
1111 fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
1112 }
1113 }
1114 fprintf(ofp, "\n");
1115
1116 fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
1117 for (r = 0; r < nref; r++) {
1118 if (!amps[r].sites)
1119 continue;
1120 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1121 for (i = 0; i < amps[r].namp; i++) {
1122 double n1 = stats->covered_perc[i][d] / nfile;
1123 double v = stats->covered_perc2[i][d] / nfile - n1*n1;
1124 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1125 }
1126 }
1127 fprintf(ofp, "\n");
1128 } while (++d < MAX_DEPTH && args->min_depth[d]);
1129 }
1130
1131 // Plus base depth for all reads, irrespective of amplicon.
1132 // This is post overlap removal, if reads in the read-pair overlap.
1133 fprintf(ofp, "# Depth per reference base for ALL data.\n");
1134 fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
1135 type);
1136 for (r = 0; r < nref; r++) {
1137 if (!amps[r].sites)
1138 continue;
1139 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1140 if (args->multi_ref)
1141 fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
1142 else
1143 fprintf(ofp, "%cDP_ALL\t%s", type, name);
1144
1145 for (i = 0; i < amps[r].len; i++) {
1146 // Basic run-length encoding provided all values are within
1147 // +- depth_bin fraction of the mid-point.
1148 int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
1149 double dmid = (dmin + dmax)/2.0;
1150 double low = dmid*(1-args->depth_bin);
1151 double high = dmid*(1+args->depth_bin);
1152 for (j = i+1; j < amps[r].len; j++) {
1153 int d = stats->depth_all[j];
1154 if (d < low || d > high)
1155 break;
1156 if (dmin > d) {
1157 dmin = d;
1158 dmid = (dmin + dmax)/2.0;
1159 low = dmid*(1-args->depth_bin);
1160 high = dmid*(1+args->depth_bin);
1161 } else if (dmax < d) {
1162 dmax = d;
1163 dmid = (dmin + dmax)/2.0;
1164 low = dmid*(1-args->depth_bin);
1165 high = dmid*(1+args->depth_bin);
1166 }
1167 }
1168 fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1169 i = j-1;
1170 }
1171 fprintf(ofp, "\n");
1172 }
1173
1174 // And depth for only reads matching to a single amplicon for full
1175 // length. This is post read overlap removal.
1176 fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
1177 fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
1178 "part.\n", type);
1179 for (r = 0; r < nref; r++) {
1180 if (!amps[r].sites)
1181 continue;
1182 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1183 if (args->multi_ref)
1184 fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
1185 else
1186 fprintf(ofp, "%cDP_VALID\t%s", type, name);
1187
1188 for (i = 0; i < amps[r].len; i++) {
1189 int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
1190 double dmid = (dmin + dmax)/2.0;
1191 double low = dmid*(1-args->depth_bin);
1192 double high = dmid*(1+args->depth_bin);
1193 for (j = i+1; j < amps[r].len; j++) {
1194 int d = stats->depth_valid[j];
1195 if (d < low || d > high)
1196 break;
1197 if (dmin > d) {
1198 dmin = d;
1199 dmid = (dmin + dmax)/2.0;
1200 low = dmid*(1-args->depth_bin);
1201 high = dmid*(1+args->depth_bin);
1202 } else if (dmax < d) {
1203 dmax = d;
1204 dmid = (dmin + dmax)/2.0;
1205 low = dmid*(1-args->depth_bin);
1206 high = dmid*(1+args->depth_bin);
1207 }
1208 }
1209 fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1210 i = j-1;
1211 }
1212 fprintf(ofp, "\n");
1213 }
1214
1215 // TCOORD (start to end) distribution
1216 fprintf(ofp, "# Distribution of aligned template coordinates.\n");
1217 fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
1218 for (r = 0; r < nref; r++) {
1219 if (!amps[r].sites)
1220 continue;
1221 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1222 for (i = 0 - (nref==1); i < amps[r].namp; i++) {
1223 if (ntcoord < kh_size(stats->tcoord[i+1])) {
1224 ntcoord = kh_size(stats->tcoord[i+1]);
1225 tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
1226 if (!tmp) {
1227 free(tpos);
1228 return -1;
1229 }
1230 tpos = tmp;
1231 }
1232
1233 khiter_t k;
1234 size_t n = 0, j;
1235 for (k = kh_begin(stats->tcoord[i+1]);
1236 k != kh_end(stats->tcoord[i+1]); k++) {
1237 if (!kh_exist(stats->tcoord[i+1], k) ||
1238 (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
1239 continue;
1240 // Key is start,end in 32-bit quantities.
1241 // Yes this limits us to 4Gb references, but just how
1242 // many primers are we planning on making? Not that many
1243 // I hope.
1244 tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
1245 tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32;
1246
1247 // Value is frequency (top 32-bits) and status (bottom 32).
1248 tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff;
1249 tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
1250 n++;
1251 }
1252
1253 if (args->tcoord_bin > 1)
1254 aggregate_tcoord(args, tpos, &n);
1255
1256 fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
1257 i+1+amps[r].first_amp); // per amplicon
1258 for (j = 0; j < n; j++) {
1259 if (tpos[j].freq < args->tcoord_min_count)
1260 continue;
1261 fprintf(ofp, "\t%d,%d,%u,%u",
1262 tpos[j].start,
1263 tpos[j].end,
1264 tpos[j].freq,
1265 tpos[j].status);
1266 }
1267 fprintf(ofp, "\n");
1268 }
1269 }
1270
1271
1272 // AMP length distribution.
1273 // 0 = both ends in this amplicon
1274 // 1 = ends in different amplicons
1275 // 2 = other end matching an unknown amplicon site
1276 // (see tcoord for further analysis of where)
1277 fprintf(ofp, "# Classification of amplicon status. Columns are\n");
1278 fprintf(ofp, "# number with both primers from this amplicon, number with\n");
1279 fprintf(ofp, "# primers from different amplicon, and number with a position\n");
1280 fprintf(ofp, "# not matching any valid amplicon primer site\n");
1281 fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
1282
1283 fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
1284 int amp_dist[3] = {0};
1285 for (r = 0; r < nref; r++) {
1286 if (!amps[r].sites)
1287 continue;
1288 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1289 for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
1290 amp_dist[0] += stats->amp_dist[i][0];
1291 amp_dist[1] += stats->amp_dist[i][1];
1292 amp_dist[2] += stats->amp_dist[i][2];
1293 }
1294 }
1295 fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
1296
1297 for (r = 0; r < nref; r++) {
1298 if (!amps[r].sites)
1299 continue;
1300 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1301 for (i = 0; i < amps[r].namp; i++) {
1302 // per amplicon
1303 fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
1304 fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
1305 stats->amp_dist[i][1], stats->amp_dist[i][2]);
1306 }
1307 }
1308
1309 free(tpos);
1310 return 0;
1311 }
1312
dump_lstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1313 int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
1314 amplicons_t *amps, int nref) {
1315 return dump_stats(args, type, name, nfile, amps, nref, 1);
1316 }
1317
dump_gstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1318 int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
1319 amplicons_t *amps, int nref) {
1320 return dump_stats(args, type, name, nfile, amps, nref, 0);
1321 }
1322
get_sample_name(sam_hdr_t * header,char * RG)1323 char const *get_sample_name(sam_hdr_t *header, char *RG) {
1324 kstring_t ks = {0};
1325 sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks);
1326 return ks.s;
1327 }
1328
1329 // Return maximum reference length (SQ is NULL) or the length
1330 // of the specified reference in SQ.
get_ref_len(sam_hdr_t * header,const char * SQ)1331 int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
1332 if (SQ) {
1333 int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0;
1334 return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1;
1335 } else {
1336 int nref = sam_hdr_nref(header), tid;;
1337 int64_t len = 0;
1338 for (tid = 0; tid < nref; tid++) {
1339 int64_t rl = sam_hdr_tid2len(header, tid);
1340 if (len < rl)
1341 len = rl;
1342 }
1343 return len;
1344 }
1345 }
1346
amplicon_stats(astats_args_t * args,khash_t (bed_list_hash)* bed_hash,char ** filev,int filec)1347 static int amplicon_stats(astats_args_t *args,
1348 khash_t(bed_list_hash) *bed_hash,
1349 char **filev, int filec) {
1350 int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
1351 samFile *fp = NULL;
1352 sam_hdr_t *header = NULL;
1353 bam1_t *b = bam_init1();
1354 FILE *ofp = args->out_fp;
1355 char sname_[8192], *sname = NULL;
1356 amplicons_t *amps = NULL;
1357
1358 // Report initial SS header. We gather data from the bed_hash entries
1359 // as well as from the first SAM header (with the requirement that all
1360 // headers should be compatible).
1361 if (filec) {
1362 if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
1363 print_error_errno("ampliconstats",
1364 "Cannot open input file \"%s\"",
1365 filev[0]);
1366 goto err;
1367 }
1368 if (!(header = sam_hdr_read(fp)))
1369 goto err;
1370
1371 if (!amps) {
1372 amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
1373 if (!amps)
1374 goto err;
1375 fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
1376 fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
1377 fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
1378 fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
1379
1380 // Note: order of hash entries will be different to order of
1381 // BED file which may also differ to order of SQ headers.
1382 // SQ header is canonical ordering (pos sorted file).
1383 khiter_t k;
1384 int bam_nref = sam_hdr_nref(header);
1385 for (i = 0; i < bam_nref; i++) {
1386 k = kh_get(bed_list_hash, bed_hash,
1387 sam_hdr_tid2name(header, i));
1388 if (!kh_exist(bed_hash, k))
1389 continue;
1390
1391 bed_entry_list_t *sites = &kh_value(bed_hash, k);
1392
1393 ref = i;
1394 amps[ref].ref = kh_key(bed_hash, k);
1395 amps[ref].sites = sites;
1396 amps[ref].namp = count_amplicon(sites);
1397 amps[ref].amp = calloc(sites->length,
1398 sizeof(*amps[ref].amp));
1399 if (!amps[ref].amp)
1400 goto err;
1401 if (args->multi_ref)
1402 fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
1403 kh_key(bed_hash, k), amps[ref].namp);
1404 else
1405 fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
1406 amps[ref].namp);
1407
1408 amps[ref].tid = ref;
1409 if (ref_tid == -1)
1410 ref_tid = ref;
1411
1412 int64_t len = get_ref_len(header, kh_key(bed_hash, k));
1413 amps[ref].len = len;
1414 if (args->multi_ref)
1415 fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
1416 kh_key(bed_hash, k), len);
1417 else
1418 fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
1419 len);
1420
1421 amps[ref].lstats = stats_alloc(len, args->max_amp,
1422 args->max_amp_len);
1423 amps[ref].gstats = stats_alloc(len, args->max_amp,
1424 args->max_amp_len);
1425 if (!amps[ref].lstats || !amps[ref].gstats)
1426 goto err;
1427 }
1428 }
1429
1430 sam_hdr_destroy(header);
1431 header = NULL;
1432 if (sam_close(fp) < 0) {
1433 fp = NULL;
1434 goto err;
1435 }
1436 fp = NULL;
1437 }
1438 fprintf(ofp, "SS\tEnd of summary\n");
1439
1440 // Extract the bits of amplicon data we need from bed hash and turn
1441 // it into a position-to-amplicon lookup table.
1442 int offset = 0;
1443 for (i = 0; i < nref; i++) {
1444 if (!amps[i].sites)
1445 continue;
1446
1447 amps[i].first_amp = offset;
1448 if (bed2amplicon(args, amps[i].sites, amps[i].amp,
1449 &s[i].namp, i==0, amps[i].ref, offset) < 0)
1450 goto err;
1451
1452 offset += amps[i].namp; // cumulative amplicon number across refs
1453 }
1454
1455 // Now iterate over file contents, one at a time.
1456 for (i = 0; i < filec; i++) {
1457 char *nstart = filev[i];
1458
1459 fp = sam_open_format(filev[i], "r", &args->ga.in);
1460 if (!fp) {
1461 print_error_errno("ampliconstats",
1462 "Cannot open input file \"%s\"",
1463 filev[i]);
1464 goto err;
1465 }
1466
1467 if (args->ga.nthreads > 0)
1468 hts_set_threads(fp, args->ga.nthreads);
1469
1470 if (!(header = sam_hdr_read(fp)))
1471 goto err;
1472
1473 if (nref != sam_hdr_nref(header)) {
1474 print_error_errno("ampliconstats",
1475 "SAM headers are not consistent across input files");
1476 goto err;
1477 }
1478 int r;
1479 for (r = 0; r < nref; r++) {
1480 if (!amps[r].ref ||
1481 strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
1482 amps[r].len != sam_hdr_tid2len(header, r)) {
1483 print_error_errno("ampliconstats",
1484 "SAM headers are not consistent across "
1485 "input files");
1486 goto err;
1487 }
1488 }
1489
1490 if (args->use_sample_name)
1491 sname = (char *)get_sample_name(header, NULL);
1492
1493 if (!sname) {
1494 sname = sname_;
1495 char *nend = filev[i] + strlen(filev[i]), *cp;
1496 if ((cp = strrchr(filev[i], '/')))
1497 nstart = cp+1;
1498 if ((cp = strrchr(nstart, '.')) &&
1499 (strcmp(cp, ".bam") == 0 ||
1500 strcmp(cp, ".sam") == 0 ||
1501 strcmp(cp, ".cram") == 0))
1502 nend = cp;
1503 if (nend - nstart >= 8192) nend = nstart+8191;
1504 memcpy(sname, nstart, nend-nstart);
1505 sname[nend-nstart] = 0;
1506 }
1507
1508 // Stats local to this sample only
1509 amp_stats_reset(amps, nref);
1510
1511 int last_ref = -9;
1512 while ((r = sam_read1(fp, header, b)) >= 0) {
1513 // Other filter options useful here?
1514 if (b->core.tid < 0)
1515 continue;
1516
1517 if (last_ref != b->core.tid) {
1518 last_ref = b->core.tid;
1519 if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
1520 goto err;
1521 }
1522
1523 if (accumulate_stats(args, amps, b) < 0)
1524 goto err;
1525 }
1526
1527 if (r < -1) {
1528 print_error_errno("ampliconstats", "Fail reading record");
1529 goto err;
1530 }
1531
1532 sam_hdr_destroy(header);
1533 if (sam_close(fp) < 0) {
1534 fp = NULL;
1535 goto err;
1536 }
1537
1538 fp = NULL;
1539 header = NULL;
1540
1541 if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
1542 goto err;
1543
1544 if (append_stats(amps, nref) < 0)
1545 goto err;
1546
1547 if (sname && sname != sname_)
1548 free(sname);
1549 sname = NULL;
1550 }
1551
1552 if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
1553 goto err;
1554
1555 ret = 0;
1556 err:
1557 bam_destroy1(b);
1558 if (ret) {
1559 if (header)
1560 sam_hdr_destroy(header);
1561 if (fp)
1562 sam_close(fp);
1563 }
1564 for (i = 0; i < nref; i++) {
1565 stats_free(amps[i].lstats);
1566 stats_free(amps[i].gstats);
1567 free(amps[i].amp);
1568 }
1569 free(amps);
1570 free(pos2start);
1571 free(pos2end);
1572 if (ret) {
1573 if (sname && sname != sname_)
1574 free(sname);
1575 }
1576
1577 return ret;
1578 }
1579
usage(astats_args_t * args,FILE * fp,int exit_status)1580 static int usage(astats_args_t *args, FILE *fp, int exit_status) {
1581 fprintf(fp,
1582 "\n"
1583 "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
1584 "\n"
1585 "Options:\n");
1586 fprintf(fp, " -f, --required-flag STR|INT\n"
1587 " Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require);
1588 fprintf(fp, " -F, --filter-flag STR|INT\n"
1589 " Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff);
1590 fprintf(fp, " -a, --max-amplicons INT\n"
1591 " Change the maximum number of amplicons permitted [%d]\n", MAX_AMP);
1592 fprintf(fp, " -l, --max-amplicon-length INT\n"
1593 " Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN);
1594 fprintf(fp, " -d, --min-depth INT[,INT]...\n"
1595 " Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]);
1596 fprintf(fp, " -m, --pos-margin INT\n"
1597 " Margin of error for matching primer positions [%d]\n", args->max_delta);
1598 fprintf(fp, " -o, --output FILE\n"
1599 " Specify output file [samtools_stdout if unset]\n");
1600 fprintf(fp, " -s, --use-sample-name\n"
1601 " Use the sample name from the first @RG header line\n");
1602 fprintf(fp, " -t, --tlen-adjust INT\n"
1603 " Add/subtract from TLEN; use when clipping but no fixmate step\n");
1604 fprintf(fp, " -b, --tcoord-bin INT\n"
1605 " Bin template start,end positions into multiples of INT[1]\n");
1606 fprintf(fp, " -c, --tcoord-min-count INT\n"
1607 " Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
1608 fprintf(fp, " -D, --depth-bin FRACTION\n"
1609 " Merge FDP values within +/- FRACTION together\n");
1610 fprintf(fp, " -S, --single-ref\n"
1611 " Force single-ref (<=1.12) output format\n");
1612 sam_global_opt_help(fp, "I.--.@");
1613
1614 return exit_status;
1615 }
1616
main_ampliconstats(int argc,char ** argv)1617 int main_ampliconstats(int argc, char **argv) {
1618 astats_args_t args = {
1619 .ga = SAM_GLOBAL_ARGS_INIT,
1620 .flag_require = 0,
1621 .flag_filter = 0x10B04,
1622 //.sites = BED_LIST_INIT,
1623 .max_delta = 30, // large enough to cope with alt primers
1624 .min_depth = {1},
1625 .use_sample_name = 0,
1626 .max_amp = MAX_AMP,
1627 .max_amp_len = MAX_AMP_LEN,
1628 .tlen_adj = 0,
1629 .out_fp = samtools_stdout,
1630 .tcoord_min_count = TCOORD_MIN_COUNT,
1631 .tcoord_bin = 1,
1632 .depth_bin = 0.01,
1633 .multi_ref = 1
1634 }, oargs = args;
1635
1636 static const struct option loptions[] =
1637 {
1638 SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
1639 {"help", no_argument, NULL, 'h'},
1640 {"flag-require", required_argument, NULL, 'f'},
1641 {"flag-filter", required_argument, NULL, 'F'},
1642 {"min-depth", required_argument, NULL, 'd'},
1643 {"output", required_argument, NULL, 'o'},
1644 {"pos-margin", required_argument, NULL, 'm'},
1645 {"use-sample-name", no_argument, NULL, 's'},
1646 {"max-amplicons", required_argument, NULL, 'a'},
1647 {"max-amplicon-length", required_argument, NULL, 'l'},
1648 {"tlen-adjust", required_argument, NULL, 't'},
1649 {"tcoord-min-count", required_argument, NULL, 'c'},
1650 {"tcoord-bin", required_argument, NULL, 'b'},
1651 {"depth-bin", required_argument, NULL, 'D'},
1652 {"single-ref", no_argument, NULL, 'S'},
1653 {NULL, 0, NULL, 0}
1654 };
1655 int opt;
1656
1657 while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
1658 switch (opt) {
1659 case 'f': args.flag_require = bam_str2flag(optarg); break;
1660 case 'F':
1661 if (args.flag_filter & 0x10000)
1662 args.flag_filter = 0; // strip default on first -F usage
1663 args.flag_filter |= bam_str2flag(optarg); break;
1664
1665 case 'm': args.max_delta = atoi(optarg); break; // margin
1666 case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
1667 case 'd': {
1668 int d = 0;
1669 char *cp = optarg, *ep;
1670 do {
1671 long n = strtol(cp, &ep, 10);
1672 args.min_depth[d++] = n;
1673 if (*ep != ',')
1674 break;
1675 cp = ep+1;
1676 } while (d < MAX_DEPTH);
1677 break;
1678 }
1679
1680 case 'a': args.max_amp = atoi(optarg)+1;break;
1681 case 'l': args.max_amp_len = atoi(optarg)+1;break;
1682
1683 case 'c': args.tcoord_min_count = atoi(optarg);break;
1684 case 'b':
1685 args.tcoord_bin = atoi(optarg);
1686 if (args.tcoord_bin < 1)
1687 args.tcoord_bin = 1;
1688 break;
1689
1690 case 't': args.tlen_adj = atoi(optarg);break;
1691
1692 case 's': args.use_sample_name = 1;break;
1693
1694 case 'o':
1695 if (!(args.out_fp = fopen(optarg, "w"))) {
1696 perror(optarg);
1697 return 1;
1698 }
1699 break;
1700
1701 case 'S':
1702 args.multi_ref = 0;
1703 break;
1704
1705 case '?': return usage(&oargs, samtools_stderr, EXIT_FAILURE);
1706 case 'h': return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
1707
1708 default:
1709 if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
1710 usage(&oargs,samtools_stderr, EXIT_FAILURE);
1711 break;
1712 }
1713 }
1714
1715 if (argc <= optind)
1716 return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
1717 if (argc <= optind+1 && isatty(STDIN_FILENO))
1718 return usage(&oargs, samtools_stderr, EXIT_FAILURE);
1719
1720 khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
1721 if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
1722 print_error_errno("ampliconstats",
1723 "Could not read file \"%s\"", argv[optind]);
1724 return 1;
1725
1726 }
1727
1728 khiter_t k, ref_count = 0;
1729 for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
1730 if (!kh_exist(bed_hash, k))
1731 continue;
1732 ref_count++;
1733 }
1734 if (ref_count == 0)
1735 return 1;
1736 if (ref_count > 1 && args.multi_ref == 0) {
1737 print_error("ampliconstats",
1738 "Single-ref mode is not permitted for BED files\n"
1739 "containing more than one reference.");
1740 return 1;
1741 }
1742
1743 args.argv = stringify_argv(argc, argv);
1744 int ret;
1745 if (argc == ++optind) {
1746 char *av = "-";
1747 ret = amplicon_stats(&args, bed_hash, &av, 1);
1748 } else {
1749 ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
1750 }
1751
1752 free(args.argv);
1753 destroy_bed_hash(bed_hash);
1754
1755 return ret;
1756 }
1757