1 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
2 
3     Copyright (C) 2020-2021 Genome Research Ltd.
4 
5     Author: James Bonfield <jkb@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE.  */
24 
25 /*
26  * This tool is designed to give "samtools stats" style output, but dedicated
27  * to small amplicon sequencing projects.  It gathers stats on the
28  * distribution of reads across amplicons.
29  */
30 
31 /*
32  * TODO:
33  * - Cope with multiple references.  What do we do here?  Just request one?
34  * - Permit regions rather than consuming whole file (maybe solves above).
35  */
36 
37 #include <config.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <inttypes.h>
42 #include <getopt.h>
43 #include <unistd.h>
44 #include <math.h>
45 
46 #include <htslib/sam.h>
47 #include <htslib/khash.h>
48 
49 #include "samtools.h"
50 #include "sam_opts.h"
51 #include "bam_ampliconclip.h"
52 
53 KHASH_MAP_INIT_INT64(tcoord, int64_t)
54 KHASH_MAP_INIT_STR(qname, int64_t)
55 
56 #ifndef MIN
57 #define MIN(a,b) ((a)<(b)?(a):(b))
58 #endif
59 
60 #ifndef MAX
61 #define MAX(a,b) ((a)>(b)?(a):(b))
62 #endif
63 
64 #ifndef ABS
65 #define ABS(a) ((a)>=0?(a):-(a))
66 #endif
67 
68 #define TCOORD_MIN_COUNT   10
69 #define MAX_AMP 1000       // Default maximum number of amplicons
70 #define MAX_AMP_LEN 1000   // Default maximum length of any single amplicon
71 #define MAX_PRIMER_PER_AMPLICON 4  // Max primers per LEFT/RIGHT
72 #define MAX_DEPTH 5        // Number of different depths permitted
73 
74 typedef struct {
75     sam_global_args ga;
76     uint32_t flag_require;
77     uint32_t flag_filter;
78     int max_delta;   // Used for matching read to amplicon primer loc
79     int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
80     int use_sample_name;
81     int max_amp;     // Total number of amplicons
82     int max_amp_len; // Maximum length of an individual amplicon
83     double depth_bin;// aggregate depth within this fraction
84     int tlen_adj;    // Adjust tlen by this amount, due to clip but no fixmate
85     FILE *out_fp;
86     char *argv;
87     int tcoord_min_count;
88     int tcoord_bin;
89     int multi_ref;
90 } astats_args_t;
91 
92 typedef struct {
93     int nseq;       // total sequence count
94     int nfiltered;  // sequence filtered
95     int nfailprimer;// count of sequences not matching the primer locations
96 
97     // Sizes of memory allocated below, to permit reset
98     int max_amp, max_amp_len, max_len;
99 
100     // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
101     int64_t *nreads, *nreads2;          // [max_amp]
102     double  *nfull_reads;               // [max_amp]; 0.5/read if paired.
103     double  *nrperc, *nrperc2;          // [max_amp]
104     int64_t *nbases, *nbases2;          // [max_amp]
105     int64_t *coverage;                  // [max_amp][max_amp_len]
106     double  (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
107     double  (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
108     khash_t(tcoord) **tcoord;           // [max_amp+1]
109 
110     // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
111     int     (*amp_dist)[3];             // [MAX_AMP][3];
112 
113     int *depth_valid; // [max_len]
114     int *depth_all;   // [max_len]
115     khash_t(qname) *qend;  // queryname end, for overlap removal
116 } astats_t;
117 
118 // We can have multiple primers for LEFT / RIGHT, so this
119 // permits detection by any compatible combination.
120 // One reference:
121 typedef struct {
122     int64_t left[MAX_PRIMER_PER_AMPLICON];
123     int nleft;
124     int64_t right[MAX_PRIMER_PER_AMPLICON];
125     int nright;
126     int64_t max_left, min_right; // inner dimensions
127     int64_t min_left, max_right; // outer dimensions
128 } amplicon_t;
129 
130 // Multiple references, we have an array of amplicons_t - one per used ref.
131 // We have per reference local and global stats here, as some of the stats
132 // are coordinate based.  However we report them combined together as a single
133 // list across all references.
134 // "namp" is the number of amplicons in this reference, but they're
135 // numbered first_amp to first_amp+namp-1 inclusively.
136 typedef struct {
137     int tid, namp;
138     int64_t len;
139     bed_entry_list_t *sites;
140     amplicon_t *amp;
141     astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
142     const char *ref;           // ref name (pointer to the bed hash table key)
143     int first_amp;             // first amplicon number for this ref
144 } amplicons_t;
145 
146 // Reinitialised for each new reference/chromosome.
147 // Counts from 1 to namp, -1 for no match and 0 for ?.
148 static int *pos2start = NULL;
149 static int *pos2end = NULL;
150 static int pos2size = 0; // allocated size of pos2start/end
151 
152 // Lookup table to go from position to amplicon based on
153 // read start / end.
initialise_amp_pos_lookup(astats_args_t * args,amplicons_t * amps,int ref)154 static int initialise_amp_pos_lookup(astats_args_t *args,
155                                      amplicons_t *amps,
156                                      int ref) {
157     int64_t i, j;
158     amplicon_t *amp = amps[ref].amp;
159     int64_t max_len = amps[ref].len;
160     int namp = amps[ref].namp;
161 
162     if (max_len+1 > pos2size) {
163         if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
164             return -1;
165         if (!(pos2end   = realloc(pos2end,   (max_len+1)*sizeof(*pos2end))))
166             return -1;
167         pos2size = max_len;
168     }
169     for (i = 0; i < max_len; i++)
170         pos2start[i] = pos2end[i] = -1;
171 
172     for (i = 0; i < namp; i++) {
173         for (j = 0; j < amp[i].nleft; j++) {
174             int64_t p;
175             for (p = amp[i].left[j] - args->max_delta;
176                  p <= amp[i].left[j] + args->max_delta; p++) {
177                 if (p < 1 || p > max_len)
178                     continue;
179                 pos2start[p-1] = i;
180             }
181         }
182         for (j = 0; j < amp[i].nright; j++) {
183             int64_t p;
184             for (p = amp[i].right[j] - args->max_delta;
185                  p <= amp[i].right[j] + args->max_delta; p++) {
186                 if (p < 1 || p > max_len)
187                     continue;
188                 pos2end[p-1] = i;
189             }
190         }
191     }
192 
193     return 0;
194 }
195 
196 // Counts amplicons.
197 // Assumption: input BED file alternates between LEFT and RIGHT primers
198 // per amplicon, thus we can count the number based on the switching
199 // orientation.
count_amplicon(bed_entry_list_t * sites)200 static int count_amplicon(bed_entry_list_t *sites) {
201     int i, namp, last_rev = 0;
202     for (i = namp = 0; i < sites->length; i++) {
203         if (sites->bp[i].rev == 0 && last_rev)
204             namp++;
205         last_rev = sites->bp[i].rev;
206     }
207 
208     return ++namp;
209 }
210 
211 // We're only interest in the internal part of the amplicon.
212 // Our bed file has LEFT start/end followed by RIGHT start/end,
213 // so collapse these to LEFT end / RIGHT start.
214 //
215 // Returns right most amplicon position on success,
216 //         < 0 on error
bed2amplicon(astats_args_t * args,bed_entry_list_t * sites,amplicon_t * amp,int * namp,int do_title,const char * ref,int first_amp)217 static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
218                             amplicon_t *amp, int *namp, int do_title,
219                             const char *ref, int first_amp) {
220     int i, j;
221     int64_t max_right = 0;
222     FILE *ofp = args->out_fp;
223 
224     *namp = 0;
225 
226     // Assume all primers for the same amplicon are adjacent in BED
227     // with all + followed by all -.  Thus - to + signifies next primer set.
228     int last_rev = 0;
229     amp[0].max_left = 0;
230     amp[0].min_right = INT64_MAX;
231     amp[0].min_left = INT64_MAX;
232     amp[0].max_right = 0;
233     if (do_title) {
234         fprintf(ofp, "# Amplicon locations from BED file.\n");
235         fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
236                 "comma-separated for alt-primers.\n");
237         if (args->multi_ref)
238             fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
239         else
240             fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
241     }
242     for (i = j = 0; i < sites->length; i++) {
243         if (i == 0 && sites->bp[i].rev != 0) {
244             fprintf(stderr, "[ampliconstats] error: BED file should start"
245                     " with the + strand primer\n");
246             return -1;
247         }
248         if (sites->bp[i].rev == 0 && last_rev) {
249             j++;
250             if (j >= args->max_amp) {
251                 fprintf(stderr, "[ampliconstats] error: too many amplicons"
252                         " (%d). Use -a option to raise this.\n", j);
253                 return -1;
254             }
255             amp[j].max_left = 0;
256             amp[j].min_right = INT64_MAX;
257             amp[j].min_left = INT64_MAX;
258             amp[j].max_right = 0;
259         }
260         if (sites->bp[i].rev == 0) {
261             if (i == 0 || last_rev) {
262                 if (j>0) fprintf(ofp, "\n");
263                 if (args->multi_ref)
264                     fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
265                 else
266                     fprintf(ofp, "AMPLICON\t%d", j+1);
267             }
268             if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
269                 print_error_errno("ampliconstats",
270                                   "too many primers per amplicon (%d).\n",
271                                   MAX_PRIMER_PER_AMPLICON);
272                 return -1;
273             }
274             amp[j].left[amp[j].nleft++] = sites->bp[i].right;
275             if (amp[j].max_left < sites->bp[i].right+1)
276                 amp[j].max_left = sites->bp[i].right+1;
277             if (amp[j].min_left > sites->bp[i].right+1)
278                 amp[j].min_left = sites->bp[i].right+1;
279             // BED file, so left+1 as zero based. right(+1-1) as
280             // BED goes one beyond end (and we want inclusive range).
281             fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
282                     sites->bp[i].left+1, sites->bp[i].right);
283         } else {
284             if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
285                 print_error_errno("ampliconstats",
286                                   "too many primers per amplicon (%d)",
287                                   MAX_PRIMER_PER_AMPLICON);
288                 return -1;
289             }
290             amp[j].right[amp[j].nright++] = sites->bp[i].left;
291             if (amp[j].min_right > sites->bp[i].left-1)
292                 amp[j].min_right = sites->bp[i].left-1;
293             if (amp[j].max_right < sites->bp[i].left-1) {
294                 amp[j].max_right = sites->bp[i].left-1;
295                 if (amp[j].max_right - amp[j].min_left + 1 >=
296                     args->max_amp_len) {
297                     fprintf(stderr, "[ampliconstats] error: amplicon "
298                             "longer (%d) than max_amp_len option (%d)\n",
299                             (int)(amp[j].max_right - amp[j].min_left + 2),
300                             args->max_amp_len);
301                     return -1;
302                 }
303                 if (max_right < amp[j].max_right)
304                     max_right = amp[j].max_right;
305             }
306             fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
307                     sites->bp[i].left+1, sites->bp[i].right);
308         }
309         last_rev = sites->bp[i].rev;
310     }
311     if (last_rev != 1) {
312         fprintf(ofp, "\n"); // useful if going to stdout
313         fprintf(stderr, "[ampliconstats] error: bed file does not end on"
314                 " a reverse strand primer.\n");
315         return -1;
316     }
317     *namp = ++j;
318     if (j) fprintf(ofp, "\n");
319 
320     if (j >= args->max_amp) {
321         fprintf(stderr, "[ampliconstats] error: "
322                 "too many amplicons (%d). Use -a option to raise this.", j);
323         return -1;
324     }
325 
326 //    for (i = 0; i < *namp; i++) {
327 //      printf("%d\t%ld", i, amp[i].length);
328 //      for (j = 0; j < amp[i].nleft; j++)
329 //          printf("%c%ld", "\t,"[j>0], amp[i].left[j]);
330 //      for (j = 0; j < amp[i].nright; j++)
331 //          printf("%c%ld", "\t,"[j>0], amp[i].right[j]);
332 //      printf("\n");
333 //    }
334 
335     return max_right;
336 }
337 
stats_free(astats_t * st)338 void stats_free(astats_t *st) {
339     if (!st)
340         return;
341 
342     free(st->nreads);
343     free(st->nreads2);
344     free(st->nfull_reads);
345     free(st->nrperc);
346     free(st->nrperc2);
347     free(st->nbases);
348     free(st->nbases2);
349     free(st->coverage);
350     free(st->covered_perc);
351     free(st->covered_perc2);
352     free(st->amp_dist);
353 
354     free(st->depth_valid);
355     free(st->depth_all);
356 
357     if (st->tcoord) {
358         int i;
359         for (i = 0; i <= st->max_amp; i++) {
360             if (st->tcoord[i])
361                 kh_destroy(tcoord, st->tcoord[i]);
362         }
363         free(st->tcoord);
364     }
365 
366     khiter_t k;
367     for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
368         if (kh_exist(st->qend, k))
369             free((void *)kh_key(st->qend, k));
370     kh_destroy(qname, st->qend);
371 
372     free(st);
373 }
374 
stats_alloc(int64_t max_len,int max_amp,int max_amp_len)375 astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
376     astats_t *st = calloc(1, sizeof(*st));
377     if (!st)
378         return NULL;
379 
380     st->max_amp = max_amp;
381     st->max_amp_len = max_amp_len;
382     st->max_len = max_len;
383 
384     if (!(st->nreads  = calloc(max_amp, sizeof(*st->nreads))))  goto err;
385     if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
386     if (!(st->nrperc  = calloc(max_amp, sizeof(*st->nrperc))))  goto err;
387     if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
388     if (!(st->nbases  = calloc(max_amp, sizeof(*st->nbases))))  goto err;
389     if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
390 
391     if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
392         goto err;
393 
394     if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage))))
395         goto err;
396 
397     if (!(st->covered_perc  = calloc(max_amp, sizeof(*st->covered_perc))))
398         goto err;
399     if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
400         goto err;
401 
402     if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
403     int i;
404     for (i = 0; i <= st->max_amp; i++)
405         if (!(st->tcoord[i] = kh_init(tcoord)))
406             goto err;
407 
408     if (!(st->qend = kh_init(qname)))
409         goto err;
410 
411     if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
412         goto err;
413     if (!(st->depth_all   = calloc(max_len, sizeof(*st->depth_all))))
414         goto err;
415 
416     if (!(st->amp_dist  = calloc(max_amp, sizeof(*st->amp_dist))))  goto err;
417 
418     return st;
419 
420  err:
421     stats_free(st);
422     return NULL;
423 }
424 
stats_reset(astats_t * st)425 static void stats_reset(astats_t *st) {
426     st->nseq = 0;
427     st->nfiltered = 0;
428     st->nfailprimer = 0;
429 
430     memset(st->nreads,  0, st->max_amp * sizeof(*st->nreads));
431     memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
432     memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
433 
434     memset(st->nrperc,  0, st->max_amp * sizeof(*st->nrperc));
435     memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
436 
437     memset(st->nbases,  0, st->max_amp * sizeof(*st->nbases));
438     memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
439 
440     memset(st->coverage, 0, st->max_amp * st->max_amp_len
441            * sizeof(*st->coverage));
442     memset(st->covered_perc,  0, st->max_amp * sizeof(*st->covered_perc));
443     memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
444 
445     // Keep the allocated entries as it's likely all files will share
446     // the same keys.  Instead we reset counters to zero for common ones
447     // and delete rare ones.
448     int i;
449     for (i = 0; i <= st->max_amp; i++) {
450         khiter_t k;
451         for (k = kh_begin(st->tcoord[i]);
452              k != kh_end(st->tcoord[i]); k++)
453             if (kh_exist(st->tcoord[i], k)) {
454                 if (kh_value(st->tcoord[i], k) < 5)
455                     kh_del(tcoord, st->tcoord[i], k);
456                 else
457                     kh_value(st->tcoord[i], k) = 0;
458             }
459     }
460 
461     khiter_t k;
462     for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
463         if (kh_exist(st->qend, k))
464             free((void *)kh_key(st->qend, k));
465     kh_clear(qname, st->qend);
466 
467     memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
468     memset(st->depth_all,   0, st->max_len * sizeof(*st->depth_all));
469     memset(st->amp_dist,  0, st->max_amp * sizeof(*st->amp_dist));
470 }
471 
amp_stats_reset(amplicons_t * amps,int nref)472 static void amp_stats_reset(amplicons_t *amps, int nref) {
473     int i;
474     for (i = 0; i < nref; i++) {
475         if (!amps[i].sites)
476             continue;
477         stats_reset(amps[i].lstats);
478     }
479 }
480 
accumulate_stats(astats_args_t * args,amplicons_t * amps,bam1_t * b)481 static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
482                             bam1_t *b) {
483     int ref = b->core.tid;
484     amplicon_t *amp = amps[ref].amp;
485     astats_t *stats = amps[ref].lstats;
486     int len = amps[ref].len;
487 
488     if (!stats)
489         return 0;
490 
491     stats->nseq++;
492     if ((b->core.flag & args->flag_require) != args->flag_require ||
493         (b->core.flag & args->flag_filter)  != 0) {
494         stats->nfiltered++;
495         return 0;
496     }
497 
498     int64_t start = b->core.pos, mstart = start; // modified start
499     int64_t end = bam_endpos(b), i;
500 
501     // Compute all-template-depth and valid-template-depth.
502     // We track current end location per read name so we can remove overlaps.
503     // Potentially we could use this data for a better amplicon-depth
504     // count too, but for now it's purely for the per-base plots.
505     int ret;
506     khiter_t k;
507     int prev_start = 0, prev_end = 0;
508     if ((b->core.flag & BAM_FPAIRED)
509         && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
510         k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
511         if (ret == 0) {
512             prev_start = kh_value(stats->qend, k) & 0xffffffff;
513             prev_end = kh_value(stats->qend, k)>>32;
514             mstart = MAX(mstart, prev_end);
515             // Ideally we'd reuse strings so we don't thrash free/malloc.
516             // However let's see if the official way of doing that (malloc
517             // itself) is fast enough first.
518             free((void *)kh_key(stats->qend, k));
519             kh_del(qname, stats->qend, k);
520             //fprintf(stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
521         } else {
522             if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
523                 return -1;
524 
525             kh_value(stats->qend, k) = start | (end << 32);
526         }
527     }
528     for (i = mstart; i < end && i < len; i++)
529         stats->depth_all[i]++;
530     if (i < end) {
531         print_error("ampliconstats", "record %s overhangs end of reference",
532                     bam_get_qname(b));
533         // But keep going, as it's harmless.
534     }
535 
536     // On single ended runs, eg ONT or PacBio, we just use the start/end
537     // of the template to assign.
538     int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
539         ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
540         : (start >= 0 && start < len ? pos2start[start] : -1);
541 
542     // ivar sometimes soft-clips 100% of the bases.
543     // This is essentially unmapped
544     if (end == start && (args->flag_filter & BAM_FUNMAP)) {
545         stats->nfiltered++;
546         return 0;
547     }
548 
549     if (anum == -1)
550         stats->nfailprimer++;
551 
552     if (anum >= 0) {
553         int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
554         if (c > 0) {
555             stats->nreads[anum]++;
556             // NB: ref bases rather than read bases
557             stats->nbases[anum] += c;
558 
559             int64_t i;
560             if (start < 0) start = 0;
561             if (end > len) end = len;
562 
563             int64_t ostart = MAX(start, amp[anum].min_left-1);
564             int64_t oend = MIN(end, amp[anum].max_right);
565             int64_t offset = amp[anum].min_left-1;
566             for (i = ostart; i < oend; i++)
567                 stats->coverage[anum*stats->max_amp_len + i-offset]++;
568         } else {
569             stats->nfailprimer++;
570         }
571     }
572 
573     // Template length in terms of amplicon number to amplicon number.
574     // We expect left to right of same amplicon (len 0), but it may go
575     // to next amplicon (len 1) or prev (len -1), etc.
576     int64_t t_end;
577     int oth_anum = -1;
578 
579     if (b->core.flag & BAM_FPAIRED) {
580         t_end = (b->core.flag & BAM_FREVERSE ? end : start)
581             + b->core.isize;
582 
583         // If we've clipped the primers but not followed up with a fixmates
584         // then our start+TLEN will take us to a location which is
585         // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
586         //
587         // The correct solution is to run samtools fixmate so TLEN is correct.
588         // The hacky solution is to fudge the expected tlen by double the
589         // average primer length (e.g. 50).
590         t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
591 
592         if (t_end > 0 && t_end < len && b->core.isize != 0)
593             oth_anum = (b->core.flag & BAM_FREVERSE)
594                 ? pos2start[t_end]
595                 : pos2end[t_end];
596     } else {
597         // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
598         oth_anum = pos2start[start];
599         t_end = end;
600     }
601 
602     // We don't want to count our pairs twice.
603     // If both left/right are known, count it on left only.
604     // If only one is known, we'll only get to this code once
605     // so we can also count it.
606     int astatus = 2;
607     if (anum != -1 && oth_anum != -1) {
608         astatus = oth_anum == anum ? 0 : 1;
609         if (start <= t_end)
610             stats->amp_dist[anum][astatus]++;
611     } else if (anum >= 0) {
612         stats->amp_dist[anum][astatus = 2]++;
613     }
614 
615     if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
616         if (prev_end && mstart > prev_end) {
617             // 2nd read with gap to 1st; undo previous increment.
618             for (i = prev_start; i < prev_end; i++)
619                 stats->depth_valid[i]--;
620             stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
621         } else {
622             // 1st read, or 2nd read that overlaps 1st
623             for (i = mstart; i < end; i++)
624                 stats->depth_valid[i]++;
625             stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
626         }
627     }
628 
629     // Track template start,end frequencies, so we can give stats on
630     // amplicon primer usage.
631     if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
632         // left to right only, so we don't double count template positions.
633         return 0;
634 
635     start = b->core.pos;
636     t_end = b->core.flag & BAM_FPAIRED
637         ? start + b->core.isize-1
638         : end;
639     uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
640     k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
641     if (ret < 0)
642         return -1;
643     if (ret == 0)
644         kh_value(stats->tcoord[anum+1], k)++;
645     else
646         kh_value(stats->tcoord[anum+1], k)=1;
647     kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
648 
649     return 0;
650 }
651 
652 // Append file local stats to global stats
append_lstats(astats_t * lstats,astats_t * gstats,int namp,int all_nseq)653 int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
654     gstats->nseq += lstats->nseq;
655     gstats->nfiltered += lstats->nfiltered;
656     gstats->nfailprimer += lstats->nfailprimer;
657 
658     int a;
659     for (a = -1; a < namp; a++) {
660         // Add khash local (kl) to khash global (kg)
661         khiter_t kl, kg;
662         for (kl = kh_begin(lstats->tcoord[a+1]);
663              kl != kh_end(lstats->tcoord[a+1]); kl++) {
664             if (!kh_exist(lstats->tcoord[a+1], kl) ||
665                 kh_value(lstats->tcoord[a+1], kl) == 0)
666                 continue;
667 
668             int ret;
669             kg = kh_put(tcoord, gstats->tcoord[a+1],
670                         kh_key(lstats->tcoord[a+1], kl),
671                         &ret);
672             if (ret < 0)
673                 return -1;
674 
675             kh_value(gstats->tcoord[a+1], kg) =
676                 (ret == 0
677                  ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF)
678                  : 0)
679                 + kh_value(lstats->tcoord[a+1], kl);
680         }
681         if (a == -1) continue;
682 
683         gstats->nreads[a]  += lstats->nreads[a];
684         gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a];
685         gstats->nfull_reads[a] += lstats->nfull_reads[a];
686 
687         // To get mean & sd for amplicon read percentage, we need
688         // to do the divisions here as nseq differs for each sample.
689         double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
690         gstats->nrperc[a]  += nrperc;
691         gstats->nrperc2[a] += nrperc*nrperc;
692 
693         gstats->nbases[a]  += lstats->nbases[a];
694         gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
695 
696         int d;
697         for (d = 0; d < MAX_DEPTH; d++) {
698             gstats->covered_perc[a][d]  += lstats->covered_perc[a][d];
699             gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
700                                          * lstats->covered_perc[a][d];
701         }
702 
703         for (d = 0; d < 3; d++)
704             gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
705     }
706 
707     for (a = 0; a < lstats->max_len; a++) {
708         gstats->depth_valid[a] += lstats->depth_valid[a];
709         gstats->depth_all[a]   += lstats->depth_all[a];
710     }
711 
712     return 0;
713 }
714 
append_stats(amplicons_t * amps,int nref)715 int append_stats(amplicons_t *amps, int nref) {
716     int i, r, all_nseq = 0;
717     for (r = 0; r < nref; r++) {
718         if (!amps[r].sites)
719             continue;
720         astats_t *stats = amps[r].lstats;
721         all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
722     }
723 
724     for (i = 0; i < nref; i++) {
725         if (!amps[i].sites)
726             continue;
727         if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp,
728                           all_nseq) < 0)
729             return -1;
730     }
731 
732     return 0;
733 }
734 
735 typedef struct {
736     int32_t start, end;
737     uint32_t freq;
738     uint32_t status;
739 } tcoord_t;
740 
741 // Sort tcoord by descending frequency and then ascending start and  end.
tcoord_freq_sort(const void * vp1,const void * vp2)742 static int tcoord_freq_sort(const void *vp1, const void *vp2) {
743     const tcoord_t *t1 = (const tcoord_t *)vp1;
744     const tcoord_t *t2 = (const tcoord_t *)vp2;
745 
746     if (t1->freq != t2->freq)
747         return t2->freq - t1->freq;
748 
749     if (t1->start != t2->start)
750         return t1->start - t2->start;
751 
752     return t1->end - t2->end;
753 }
754 
755 
756 /*
757  * Merges tcoord start,end,freq,status tuples if their coordinates are
758  * close together.  We aim to keep the start,end for the most frequent
759  * value and assume that is the correct coordinate and all others are
760  * minor fluctuations due to errors or variants.
761  *
762  * We sort by frequency first and then merge later items in the list into
763  * the earlier more frequent ones.  It's O(N^2), but sufficient for now
764  * given current scale of projects.
765  *
766  * If we ever need to resolve that then consider sorting by start
767  * coordinate and scanning the list to find all items within X, find
768  * the most frequent of those, and then cluster that way.  (I'd have
769  * done that had I thought of it at the time!)
770  */
aggregate_tcoord(astats_args_t * args,tcoord_t * tpos,size_t * np)771 static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
772     size_t n = *np, j, j2, j3, k;
773 
774     // Sort by frequency and cluster infrequent coords into frequent
775     // ones provided they're close by.
776     // This is O(N^2), but we've already binned by tcoord_bin/2 so
777     // the list isn't intended to be vast at this point.
778     qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
779 
780     // For frequency ties, find mid start coord, and then find mid end
781     // coord of those matching start.
782     // We make that the first item so we merge into that mid point.
783     for (j = 0; j < n; j++) {
784         for (j2 = j+1; j2 < n; j2++) {
785             if (tpos[j].freq != tpos[j2].freq)
786                 break;
787             if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
788                 break;
789         }
790 
791         // j to j2 all within bin of a common start,
792         // m is the mid start.
793         if (j2-1 > j) {
794             size_t m = (j2-1 + j)/2;
795 
796             // Find mid end for this same start
797             while (m > 1 && tpos[m].start == tpos[m-1].start)
798                 m--;
799             for (j3 = m+1; j3 < j2; j3++) {
800                 if (tpos[m].start != tpos[j3].start)
801                     break;
802                 if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
803                     break;
804             }
805             if (j3-1 > m)
806                 m = (j3-1 + m)/2;
807 
808             // Swap with first item.
809             tcoord_t tmp = tpos[j];
810             tpos[j] = tpos[m];
811             tpos[m] = tmp;
812             j = j2-1;
813         }
814     }
815 
816     // Now merge in coordinates.
817     // This bit is O(N^2), so consider binning first to reduce the
818     // size of the list if we have excessive positional variation.
819     for (k = j = 0; j < n; j++) {
820         if (!tpos[j].freq)
821             continue;
822 
823         if (k < j)
824             tpos[k] = tpos[j];
825 
826         for (j2 = j+1; j2 < n; j2++) {
827             if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
828                 ABS(tpos[j].end  -tpos[j2].end)  < args->tcoord_bin/2 &&
829                 tpos[j].status == tpos[j2].status) {
830                 tpos[k].freq += tpos[j2].freq;
831                 tpos[j2].freq = 0;
832             }
833         }
834         k++;
835     }
836 
837     *np = k;
838 }
839 
dump_stats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref,int local)840 int dump_stats(astats_args_t *args, char type, char *name, int nfile,
841                amplicons_t *amps, int nref, int local) {
842     int i, r;
843     FILE *ofp = args->out_fp;
844     tcoord_t *tpos = NULL;
845     size_t ntcoord = 0;
846 
847     // summary stats for this sample (or for all samples)
848     fprintf(ofp, "# Summary stats.\n");
849     fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
850 
851     for (r = 0; r < nref; r++) {
852         if (!amps[r].sites)
853             continue;
854         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
855         int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
856         char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
857         if (!name_ref)
858             return -1;
859         if (args->multi_ref)
860             sprintf(name_ref, "%s\t%s", name, amps[r].ref);
861         else
862             sprintf(name_ref, "%s", name);
863         fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
864                 type, name_ref, stats->nseq);
865         fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
866                 type, name_ref, stats->nfiltered);
867         fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
868                 type, name_ref, stats->nfailprimer);
869         fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
870                 type, name_ref, nmatch);
871 
872         int d = 0;
873         do {
874             // From first to last amplicon only, so not entire consensus.
875             // If contig length is known, maybe we want to add the missing
876             // count to < DEPTH figures?
877             int64_t start = 0, covered = 0, total = 0;
878             amplicon_t *amp = amps[r].amp;
879             for (i = 0; i < amps[r].namp; i++) {
880                 int64_t j, offset = amp[i].min_left-1;
881                 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
882                     fprintf(stderr, "[ampliconstats] error: "
883                             "Maximum amplicon length (%d) exceeded for '%s'\n",
884                             stats->max_amp, name);
885                     return -1;
886                 }
887                 for (j = MAX(start, amp[i].max_left-1);
888                      j < MAX(start, amp[i].min_right); j++) {
889                     if (stats->coverage[i*stats->max_amp_len + j-offset]
890                         >= args->min_depth[d])
891                         covered++;
892                     total++;
893                 }
894                 start = MAX(start, amp[i].min_right);
895             }
896             fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
897                     PRId64"\t%"PRId64"\n", type, name_ref,
898                     args->min_depth[d], args->min_depth[d],
899                     total-covered, covered);
900         } while (++d < MAX_DEPTH && args->min_depth[d]);
901 
902         free(name_ref);
903     }
904 
905     // Read count
906     fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
907     fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
908     fprintf(ofp, "%cREADS\t%s", type, name);
909     for (r = 0; r < nref; r++) {
910         if (!amps[r].sites)
911             continue;
912         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
913         for (i = 0; i < amps[r].namp; i++) {
914             fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
915         }
916     }
917     fprintf(ofp, "\n");
918 
919     // Valid depth is the number of full length reads (already divided
920     // by the number we expect to cover), so +0.5 per read in pair.
921     // A.k.a "usable depth" in the plots.
922     fprintf(ofp, "%cVDEPTH\t%s", type, name);
923     for (r = 0; r < nref; r++) {
924         if (!amps[r].sites)
925             continue;
926         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
927         for (i = 0; i < amps[r].namp; i++)
928             fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
929     }
930     fprintf(ofp, "\n");
931 
932     if (type == 'C') {
933         // For combined we can compute mean & standard deviation too
934         fprintf(ofp, "CREADS\tMEAN");
935         for (r = 0; r < nref; r++) {
936             if (!amps[r].sites)
937                 continue;
938             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
939             for (i = 0; i < amps[r].namp; i++) {
940                 fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
941             }
942         }
943         fprintf(ofp, "\n");
944 
945         fprintf(ofp, "CREADS\tSTDDEV");
946         for (r = 0; r < nref; r++) {
947             if (!amps[r].sites)
948                 continue;
949             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
950             for (i = 0; i < amps[r].namp; i++) {
951                 double n1 = stats->nreads[i];
952                 fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
953                         ? sqrt(stats->nreads2[i]/(double)nfile
954                                - (n1/nfile)*(n1/nfile))
955                         : 0);
956             }
957         }
958         fprintf(ofp, "\n");
959     }
960 
961     fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
962     fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
963     fprintf(ofp, "%cRPERC\t%s", type, name);
964     int all_nseq = 0;
965     for (r = 0; r < nref; r++) {
966         if (!amps[r].sites)
967             continue;
968         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
969         all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
970     }
971     for (r = 0; r < nref; r++) {
972         if (!amps[r].sites)
973             continue;
974         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
975         for (i = 0; i < amps[r].namp; i++) {
976             if (type == 'C') {
977                 fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
978             } else {
979                 fprintf(ofp, "\t%.3f",
980                         all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
981             }
982         }
983     }
984     fprintf(ofp, "\n");
985 
986     if (type == 'C') {
987         // For combined we compute mean and standard deviation too
988         fprintf(ofp, "CRPERC\tMEAN");
989         for (r = 0; r < nref; r++) {
990             if (!amps[r].sites)
991                 continue;
992             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
993             for (i = 0; i < amps[r].namp; i++) {
994                 fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
995             }
996         }
997         fprintf(ofp, "\n");
998 
999         fprintf(ofp, "CRPERC\tSTDDEV");
1000         for (r = 0; r < nref; r++) {
1001             if (!amps[r].sites)
1002                 continue;
1003             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1004             for (i = 0; i < amps[r].namp; i++) {
1005                 // variance = SUM(X^2) - ((SUM(X)^2) / N)
1006                 double n1 = stats->nrperc[i];
1007                 double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
1008                 fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
1009             }
1010         }
1011         fprintf(ofp, "\n");
1012     }
1013 
1014     // Base depth
1015     fprintf(ofp, "# Read depth per amplicon.\n");
1016     fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
1017     fprintf(ofp, "%cDEPTH\t%s", type, name);
1018     for (r = 0; r < nref; r++) {
1019         if (!amps[r].sites)
1020             continue;
1021         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1022         amplicon_t *amp = amps[r].amp;
1023         for (i = 0; i < amps[r].namp; i++) {
1024             int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1025             int64_t alen = amp[i].min_right - amp[i].max_left+1;
1026             fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
1027         }
1028     }
1029     fprintf(ofp, "\n");
1030 
1031     if (type == 'C') {
1032         // For combined we can compute mean & standard deviation too
1033         fprintf(ofp, "CDEPTH\tMEAN");
1034         for (r = 0; r < nref; r++) {
1035             if (!amps[r].sites)
1036                 continue;
1037             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1038             amplicon_t *amp = amps[r].amp;
1039             int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1040             for (i = 0; i < amps[r].namp; i++) {
1041                 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1042                 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
1043             }
1044         }
1045         fprintf(ofp, "\n");
1046 
1047         fprintf(ofp, "CDEPTH\tSTDDEV");
1048         for (r = 0; r < nref; r++) {
1049             if (!amps[r].sites)
1050                 continue;
1051             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1052             amplicon_t *amp = amps[r].amp;
1053             for (i = 0; i < amps[r].namp; i++) {
1054                 double alen = amp[i].min_right - amp[i].max_left+1;
1055                 double n1 = stats->nbases[i] / alen;
1056                 double v = stats->nbases2[i] / (alen*alen) /nfile
1057                     - (n1/nfile)*(n1/nfile);
1058                 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1059             }
1060         }
1061         fprintf(ofp, "\n");
1062     }
1063 
1064     // Percent Coverage
1065     if (type == 'F') {
1066         fprintf(ofp, "# Percentage coverage per amplicon\n");
1067         fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
1068         int d = 0;
1069         do {
1070             fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
1071 
1072             for (r = 0; r < nref; r++) {
1073                 if (!amps[r].sites)
1074                     continue;
1075                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1076                 amplicon_t *amp = amps[r].amp;
1077                 for (i = 0; i < amps[r].namp; i++) {
1078                     int covered = 0;
1079                     if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
1080                         fprintf(stderr, "[ampliconstats] error: "
1081                                 "Maximum amplicon length (%d) exceeded for '%s'\n",
1082                                 stats->max_amp, name);
1083                         return -1;
1084                     }
1085                     int64_t j, offset = amp[i].min_left-1;
1086                     for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
1087                         int apos = i*stats->max_amp_len + j-offset;
1088                         if (stats->coverage[apos] >= args->min_depth[d])
1089                             covered++;
1090                     }
1091                     int64_t alen = amp[i].min_right - amp[i].max_left+1;
1092                     stats->covered_perc[i][d] = 100.0 * covered / alen;
1093                     fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
1094                 }
1095             }
1096             fprintf(ofp, "\n");
1097         } while (++d < MAX_DEPTH && args->min_depth[d]);
1098 
1099     } else if (type == 'C') {
1100         // For combined we can compute mean & standard deviation too
1101         int d = 0;
1102         do {
1103             fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
1104             for (r = 0; r < nref; r++) {
1105                 if (!amps[r].sites)
1106                     continue;
1107                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1108                 for (i = 0; i < amps[r].namp; i++) {
1109                     fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
1110                 }
1111             }
1112             fprintf(ofp, "\n");
1113 
1114             fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
1115             for (r = 0; r < nref; r++) {
1116                 if (!amps[r].sites)
1117                     continue;
1118                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1119                 for (i = 0; i < amps[r].namp; i++) {
1120                     double n1 = stats->covered_perc[i][d] / nfile;
1121                     double v = stats->covered_perc2[i][d] / nfile - n1*n1;
1122                     fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1123                 }
1124             }
1125             fprintf(ofp, "\n");
1126         } while (++d < MAX_DEPTH && args->min_depth[d]);
1127     }
1128 
1129     // Plus base depth for all reads, irrespective of amplicon.
1130     // This is post overlap removal, if reads in the read-pair overlap.
1131     fprintf(ofp, "# Depth per reference base for ALL data.\n");
1132     fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
1133             type);
1134     for (r = 0; r < nref; r++) {
1135         if (!amps[r].sites)
1136             continue;
1137         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1138         if (args->multi_ref)
1139             fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
1140         else
1141             fprintf(ofp, "%cDP_ALL\t%s", type, name);
1142 
1143         for (i = 0; i < amps[r].len; i++) {
1144             // Basic run-length encoding provided all values are within
1145             // +- depth_bin fraction of the mid-point.
1146             int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
1147             double dmid = (dmin + dmax)/2.0;
1148             double low  = dmid*(1-args->depth_bin);
1149             double high = dmid*(1+args->depth_bin);
1150             for (j = i+1; j < amps[r].len; j++) {
1151                 int d = stats->depth_all[j];
1152                 if (d < low || d > high)
1153                     break;
1154                 if (dmin > d) {
1155                     dmin = d;
1156                     dmid = (dmin + dmax)/2.0;
1157                     low  = dmid*(1-args->depth_bin);
1158                     high = dmid*(1+args->depth_bin);
1159                 } else if (dmax < d) {
1160                     dmax = d;
1161                     dmid = (dmin + dmax)/2.0;
1162                     low  = dmid*(1-args->depth_bin);
1163                     high = dmid*(1+args->depth_bin);
1164                 }
1165             }
1166             fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1167             i = j-1;
1168         }
1169         fprintf(ofp, "\n");
1170     }
1171 
1172     // And depth for only reads matching to a single amplicon for full
1173     // length.  This is post read overlap removal.
1174     fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
1175     fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
1176             "part.\n", type);
1177     for (r = 0; r < nref; r++) {
1178         if (!amps[r].sites)
1179             continue;
1180         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1181         if (args->multi_ref)
1182             fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
1183         else
1184             fprintf(ofp, "%cDP_VALID\t%s", type, name);
1185 
1186         for (i = 0; i < amps[r].len; i++) {
1187             int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
1188             double dmid = (dmin + dmax)/2.0;
1189             double low  = dmid*(1-args->depth_bin);
1190             double high = dmid*(1+args->depth_bin);
1191             for (j = i+1; j < amps[r].len; j++) {
1192                 int d = stats->depth_valid[j];
1193                 if (d < low || d > high)
1194                     break;
1195                 if (dmin > d) {
1196                     dmin = d;
1197                     dmid = (dmin + dmax)/2.0;
1198                     low  = dmid*(1-args->depth_bin);
1199                     high = dmid*(1+args->depth_bin);
1200                 } else if (dmax < d) {
1201                     dmax = d;
1202                     dmid = (dmin + dmax)/2.0;
1203                     low  = dmid*(1-args->depth_bin);
1204                     high = dmid*(1+args->depth_bin);
1205                 }
1206             }
1207             fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1208             i = j-1;
1209         }
1210         fprintf(ofp, "\n");
1211     }
1212 
1213     // TCOORD (start to end) distribution
1214     fprintf(ofp, "# Distribution of aligned template coordinates.\n");
1215     fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
1216     for (r = 0; r < nref; r++) {
1217         if (!amps[r].sites)
1218             continue;
1219         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1220         for (i = 0 - (nref==1); i < amps[r].namp; i++) {
1221             if (ntcoord < kh_size(stats->tcoord[i+1])) {
1222                 ntcoord = kh_size(stats->tcoord[i+1]);
1223                 tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
1224                 if (!tmp) {
1225                     free(tpos);
1226                     return -1;
1227                 }
1228                 tpos = tmp;
1229             }
1230 
1231             khiter_t k;
1232             size_t n = 0, j;
1233             for (k = kh_begin(stats->tcoord[i+1]);
1234                  k != kh_end(stats->tcoord[i+1]); k++) {
1235                 if (!kh_exist(stats->tcoord[i+1], k) ||
1236                     (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
1237                     continue;
1238                 // Key is start,end in 32-bit quantities.
1239                 // Yes this limits us to 4Gb references, but just how
1240                 // many primers are we planning on making?  Not that many
1241                 // I hope.
1242                 tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
1243                 tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
1244 
1245                 // Value is frequency (top 32-bits) and status (bottom 32).
1246                 tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
1247                 tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
1248                 n++;
1249             }
1250 
1251             if (args->tcoord_bin > 1)
1252                 aggregate_tcoord(args, tpos, &n);
1253 
1254             fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
1255                     i+1+amps[r].first_amp); // per amplicon
1256             for (j = 0; j < n; j++) {
1257                 if (tpos[j].freq < args->tcoord_min_count)
1258                     continue;
1259                 fprintf(ofp, "\t%d,%d,%u,%u",
1260                         tpos[j].start,
1261                         tpos[j].end,
1262                         tpos[j].freq,
1263                         tpos[j].status);
1264             }
1265             fprintf(ofp, "\n");
1266         }
1267     }
1268 
1269 
1270     // AMP length distribution.
1271     // 0 = both ends in this amplicon
1272     // 1 = ends in different amplicons
1273     // 2 = other end matching an unknown amplicon site
1274     //     (see tcoord for further analysis of where)
1275     fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
1276     fprintf(ofp, "# number with both primers from this amplicon, number with\n");
1277     fprintf(ofp, "# primers from different amplicon, and number with a position\n");
1278     fprintf(ofp, "# not matching any valid amplicon primer site\n");
1279     fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
1280 
1281     fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
1282     int amp_dist[3] = {0};
1283     for (r = 0; r < nref; r++) {
1284         if (!amps[r].sites)
1285             continue;
1286         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1287         for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
1288             amp_dist[0] += stats->amp_dist[i][0];
1289             amp_dist[1] += stats->amp_dist[i][1];
1290             amp_dist[2] += stats->amp_dist[i][2];
1291         }
1292     }
1293     fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
1294 
1295     for (r = 0; r < nref; r++) {
1296         if (!amps[r].sites)
1297             continue;
1298         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1299         for (i = 0; i < amps[r].namp; i++) {
1300             // per amplicon
1301             fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
1302             fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
1303                     stats->amp_dist[i][1], stats->amp_dist[i][2]);
1304         }
1305     }
1306 
1307     free(tpos);
1308     return 0;
1309 }
1310 
dump_lstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1311 int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
1312                amplicons_t *amps, int nref) {
1313     return dump_stats(args, type, name, nfile, amps, nref, 1);
1314 }
1315 
dump_gstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1316 int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
1317                amplicons_t *amps, int nref) {
1318     return dump_stats(args, type, name, nfile, amps, nref, 0);
1319 }
1320 
get_sample_name(sam_hdr_t * header,char * RG)1321 char const *get_sample_name(sam_hdr_t *header, char *RG) {
1322     kstring_t ks = {0};
1323     sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks);
1324     return ks.s;
1325 }
1326 
1327 // Return maximum reference length (SQ is NULL) or the length
1328 // of the specified reference in SQ.
get_ref_len(sam_hdr_t * header,const char * SQ)1329 int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
1330     if (SQ) {
1331         int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0;
1332         return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1;
1333     } else {
1334         int nref = sam_hdr_nref(header), tid;;
1335         int64_t len = 0;
1336         for (tid = 0; tid < nref; tid++) {
1337             int64_t rl = sam_hdr_tid2len(header, tid);
1338             if (len < rl)
1339                 len = rl;
1340         }
1341         return len;
1342     }
1343 }
1344 
amplicon_stats(astats_args_t * args,khash_t (bed_list_hash)* bed_hash,char ** filev,int filec)1345 static int amplicon_stats(astats_args_t *args,
1346                           khash_t(bed_list_hash) *bed_hash,
1347                           char **filev, int filec) {
1348     int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
1349     samFile *fp = NULL;
1350     sam_hdr_t *header = NULL;
1351     bam1_t *b = bam_init1();
1352     FILE *ofp = args->out_fp;
1353     char sname_[8192], *sname = NULL;
1354     amplicons_t *amps = NULL;
1355 
1356     // Report initial SS header.  We gather data from the bed_hash entries
1357     // as well as from the first SAM header (with the requirement that all
1358     // headers should be compatible).
1359     if (filec) {
1360         if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
1361             print_error_errno("ampliconstats",
1362                               "Cannot open input file \"%s\"",
1363                               filev[0]);
1364             goto err;
1365         }
1366         if (!(header = sam_hdr_read(fp)))
1367             goto err;
1368 
1369         if (!amps) {
1370             amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
1371             if (!amps)
1372                 goto err;
1373             fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
1374             fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
1375             fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
1376             fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
1377 
1378             // Note: order of hash entries will be different to order of
1379             // BED file which may also differ to order of SQ headers.
1380             // SQ header is canonical ordering (pos sorted file).
1381             khiter_t k;
1382             int bam_nref = sam_hdr_nref(header);
1383             for (i = 0; i < bam_nref; i++) {
1384                 k = kh_get(bed_list_hash, bed_hash,
1385                            sam_hdr_tid2name(header, i));
1386                 if (!kh_exist(bed_hash, k))
1387                     continue;
1388 
1389                 bed_entry_list_t *sites = &kh_value(bed_hash, k);
1390 
1391                 ref = i;
1392                 amps[ref].ref = kh_key(bed_hash, k);
1393                 amps[ref].sites = sites;
1394                 amps[ref].namp = count_amplicon(sites);
1395                 amps[ref].amp  = calloc(sites->length,
1396                                         sizeof(*amps[ref].amp));
1397                 if (!amps[ref].amp)
1398                     goto err;
1399                 if (args->multi_ref)
1400                     fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
1401                             kh_key(bed_hash, k), amps[ref].namp);
1402                 else
1403                     fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
1404                             amps[ref].namp);
1405 
1406                 amps[ref].tid = ref;
1407                 if (ref_tid == -1)
1408                     ref_tid = ref;
1409 
1410                 int64_t len = get_ref_len(header, kh_key(bed_hash, k));
1411                 amps[ref].len = len;
1412                 if (args->multi_ref)
1413                     fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
1414                             kh_key(bed_hash, k), len);
1415                 else
1416                     fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
1417                             len);
1418 
1419                 amps[ref].lstats = stats_alloc(len, args->max_amp,
1420                                                args->max_amp_len);
1421                 amps[ref].gstats = stats_alloc(len, args->max_amp,
1422                                                args->max_amp_len);
1423                 if (!amps[ref].lstats || !amps[ref].gstats)
1424                     goto err;
1425             }
1426         }
1427 
1428         sam_hdr_destroy(header);
1429         header = NULL;
1430         if (sam_close(fp) < 0) {
1431             fp = NULL;
1432             goto err;
1433         }
1434         fp = NULL;
1435     }
1436     fprintf(ofp, "SS\tEnd of summary\n");
1437 
1438     // Extract the bits of amplicon data we need from bed hash and turn
1439     // it into a position-to-amplicon lookup table.
1440     int offset = 0;
1441     for (i = 0; i < nref; i++) {
1442         if (!amps[i].sites)
1443             continue;
1444 
1445         amps[i].first_amp = offset;
1446         if (bed2amplicon(args, amps[i].sites, amps[i].amp,
1447                          &amps[i].namp, i==0, amps[i].ref, offset) < 0)
1448             goto err;
1449 
1450         offset += amps[i].namp; // cumulative amplicon number across refs
1451     }
1452 
1453     // Now iterate over file contents, one at a time.
1454     for (i = 0; i < filec; i++) {
1455         char *nstart = filev[i];
1456 
1457         fp = sam_open_format(filev[i], "r", &args->ga.in);
1458         if (!fp) {
1459             print_error_errno("ampliconstats",
1460                               "Cannot open input file \"%s\"",
1461                               filev[i]);
1462             goto err;
1463         }
1464 
1465         if (args->ga.nthreads > 0)
1466             hts_set_threads(fp, args->ga.nthreads);
1467 
1468         if (!(header = sam_hdr_read(fp)))
1469             goto err;
1470 
1471         if (nref != sam_hdr_nref(header)) {
1472             print_error_errno("ampliconstats",
1473                               "SAM headers are not consistent across input files");
1474             goto err;
1475         }
1476         int r;
1477         for (r = 0; r < nref; r++) {
1478             if (!amps[r].ref ||
1479                 strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
1480                 amps[r].len != sam_hdr_tid2len(header, r)) {
1481                 print_error_errno("ampliconstats",
1482                                   "SAM headers are not consistent across "
1483                                   "input files");
1484                 goto err;
1485             }
1486         }
1487 
1488         if (args->use_sample_name)
1489             sname = (char *)get_sample_name(header, NULL);
1490 
1491         if (!sname) {
1492             sname = sname_;
1493             char *nend = filev[i] + strlen(filev[i]), *cp;
1494             if ((cp = strrchr(filev[i], '/')))
1495                 nstart = cp+1;
1496             if ((cp = strrchr(nstart, '.')) &&
1497                 (strcmp(cp, ".bam") == 0 ||
1498                  strcmp(cp, ".sam") == 0 ||
1499                  strcmp(cp, ".cram") == 0))
1500                 nend = cp;
1501             if (nend - nstart >= 8192) nend = nstart+8191;
1502             memcpy(sname, nstart, nend-nstart);
1503             sname[nend-nstart] = 0;
1504         }
1505 
1506         // Stats local to this sample only
1507         amp_stats_reset(amps, nref);
1508 
1509         int last_ref = -9;
1510         while ((r = sam_read1(fp, header, b)) >= 0) {
1511             // Other filter options useful here?
1512             if (b->core.tid < 0)
1513                 continue;
1514 
1515             if (last_ref != b->core.tid) {
1516                 last_ref  = b->core.tid;
1517                 if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
1518                     goto err;
1519             }
1520 
1521             if (accumulate_stats(args, amps, b) < 0)
1522                 goto err;
1523         }
1524 
1525         if (r < -1) {
1526             print_error_errno("ampliconstats", "Fail reading record");
1527             goto err;
1528         }
1529 
1530         sam_hdr_destroy(header);
1531         if (sam_close(fp) < 0) {
1532             fp = NULL;
1533             goto err;
1534         }
1535 
1536         fp = NULL;
1537         header = NULL;
1538 
1539         if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
1540             goto err;
1541 
1542         if (append_stats(amps, nref) < 0)
1543             goto err;
1544 
1545         if (sname && sname != sname_)
1546             free(sname);
1547         sname = NULL;
1548     }
1549 
1550     if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
1551         goto err;
1552 
1553     ret = 0;
1554  err:
1555     bam_destroy1(b);
1556     if (ret) {
1557         if (header)
1558             sam_hdr_destroy(header);
1559         if (fp)
1560             sam_close(fp);
1561     }
1562     for (i = 0; i < nref; i++) {
1563         stats_free(amps[i].lstats);
1564         stats_free(amps[i].gstats);
1565         free(amps[i].amp);
1566     }
1567     free(amps);
1568     free(pos2start);
1569     free(pos2end);
1570     if (ret) {
1571         if (sname && sname != sname_)
1572             free(sname);
1573     }
1574 
1575     return ret;
1576 }
1577 
usage(astats_args_t * args,FILE * fp,int exit_status)1578 static int usage(astats_args_t *args, FILE *fp, int exit_status) {
1579     fprintf(fp,
1580 "\n"
1581 "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
1582 "\n"
1583 "Options:\n");
1584     fprintf(fp, "  -f, --required-flag STR|INT\n"
1585             "               Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require);
1586     fprintf(fp, "  -F, --filter-flag STR|INT\n"
1587             "               Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff);
1588     fprintf(fp, "  -a, --max-amplicons INT\n"
1589             "               Change the maximum number of amplicons permitted [%d]\n", MAX_AMP);
1590     fprintf(fp, "  -l, --max-amplicon-length INT\n"
1591             "               Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN);
1592     fprintf(fp, "  -d, --min-depth INT[,INT]...\n"
1593             "               Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]);
1594     fprintf(fp, "  -m, --pos-margin INT\n"
1595             "               Margin of error for matching primer positions [%d]\n", args->max_delta);
1596     fprintf(fp, "  -o, --output FILE\n"
1597             "               Specify output file [stdout if unset]\n");
1598     fprintf(fp, "  -s, --use-sample-name\n"
1599             "               Use the sample name from the first @RG header line\n");
1600     fprintf(fp, "  -t, --tlen-adjust INT\n"
1601             "               Add/subtract from TLEN; use when clipping but no fixmate step\n");
1602     fprintf(fp, "  -b, --tcoord-bin INT\n"
1603             "               Bin template start,end positions into multiples of INT[1]\n");
1604     fprintf(fp, "  -c, --tcoord-min-count INT\n"
1605             "               Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
1606     fprintf(fp, "  -D, --depth-bin FRACTION\n"
1607             "               Merge FDP values within +/- FRACTION together\n");
1608     fprintf(fp, "  -S, --single-ref\n"
1609             "               Force single-ref (<=1.12) output format\n");
1610     sam_global_opt_help(fp, "I.--.@");
1611 
1612     return exit_status;
1613 }
1614 
main_ampliconstats(int argc,char ** argv)1615 int main_ampliconstats(int argc, char **argv) {
1616     astats_args_t args = {
1617         .ga = SAM_GLOBAL_ARGS_INIT,
1618         .flag_require = 0,
1619         .flag_filter = 0x10B04,
1620         //.sites = BED_LIST_INIT,
1621         .max_delta = 30, // large enough to cope with alt primers
1622         .min_depth = {1},
1623         .use_sample_name = 0,
1624         .max_amp = MAX_AMP,
1625         .max_amp_len = MAX_AMP_LEN,
1626         .tlen_adj = 0,
1627         .out_fp = stdout,
1628         .tcoord_min_count = TCOORD_MIN_COUNT,
1629         .tcoord_bin = 1,
1630         .depth_bin = 0.01,
1631         .multi_ref = 1
1632     }, oargs = args;
1633 
1634     static const struct option loptions[] =
1635     {
1636         SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
1637         {"help", no_argument, NULL, 'h'},
1638         {"flag-require", required_argument, NULL, 'f'},
1639         {"flag-filter", required_argument, NULL, 'F'},
1640         {"min-depth", required_argument, NULL, 'd'},
1641         {"output", required_argument, NULL, 'o'},
1642         {"pos-margin", required_argument, NULL, 'm'},
1643         {"use-sample-name", no_argument, NULL, 's'},
1644         {"max-amplicons", required_argument, NULL, 'a'},
1645         {"max-amplicon-length", required_argument, NULL, 'l'},
1646         {"tlen-adjust", required_argument, NULL, 't'},
1647         {"tcoord-min-count", required_argument, NULL, 'c'},
1648         {"tcoord-bin", required_argument, NULL, 'b'},
1649         {"depth-bin", required_argument, NULL, 'D'},
1650         {"single-ref", no_argument, NULL, 'S'},
1651         {NULL, 0, NULL, 0}
1652     };
1653     int opt;
1654 
1655     while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
1656         switch (opt) {
1657         case 'f': args.flag_require = bam_str2flag(optarg); break;
1658         case 'F':
1659             if (args.flag_filter & 0x10000)
1660                 args.flag_filter = 0; // strip default on first -F usage
1661             args.flag_filter |= bam_str2flag(optarg); break;
1662 
1663         case 'm': args.max_delta = atoi(optarg); break; // margin
1664         case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
1665         case 'd': {
1666             int d = 0;
1667             char *cp = optarg, *ep;
1668             do {
1669                 long n = strtol(cp, &ep, 10);
1670                 args.min_depth[d++] = n;
1671                 if (*ep != ',')
1672                     break;
1673                 cp = ep+1;
1674             } while (d < MAX_DEPTH);
1675             break;
1676         }
1677 
1678         case 'a': args.max_amp = atoi(optarg)+1;break;
1679         case 'l': args.max_amp_len = atoi(optarg)+1;break;
1680 
1681         case 'c': args.tcoord_min_count = atoi(optarg);break;
1682         case 'b':
1683             args.tcoord_bin = atoi(optarg);
1684             if (args.tcoord_bin < 1)
1685                 args.tcoord_bin = 1;
1686             break;
1687 
1688         case 't': args.tlen_adj = atoi(optarg);break;
1689 
1690         case 's': args.use_sample_name = 1;break;
1691 
1692         case 'o':
1693             if (!(args.out_fp = fopen(optarg, "w"))) {
1694                 perror(optarg);
1695                 return 1;
1696             }
1697             break;
1698 
1699         case 'S':
1700             args.multi_ref = 0;
1701             break;
1702 
1703         case '?': return usage(&oargs, stderr, EXIT_FAILURE);
1704         case 'h': return usage(&oargs, stdout, EXIT_SUCCESS);
1705 
1706         default:
1707             if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
1708                 usage(&oargs,stderr, EXIT_FAILURE);
1709             break;
1710         }
1711     }
1712 
1713     if (argc <= optind)
1714         return usage(&oargs, stdout, EXIT_SUCCESS);
1715     if (argc <= optind+1 && isatty(STDIN_FILENO))
1716         return usage(&oargs, stderr, EXIT_FAILURE);
1717 
1718     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
1719     if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
1720         print_error_errno("ampliconstats",
1721                           "Could not read file \"%s\"", argv[optind]);
1722         return 1;
1723 
1724     }
1725 
1726     khiter_t k, ref_count = 0;
1727     for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
1728         if (!kh_exist(bed_hash, k))
1729             continue;
1730         ref_count++;
1731     }
1732     if (ref_count == 0)
1733         return 1;
1734     if (ref_count > 1 && args.multi_ref == 0) {
1735         print_error("ampliconstats",
1736                     "Single-ref mode is not permitted for BED files\n"
1737                     "containing more than one reference.");
1738         return 1;
1739     }
1740 
1741     args.argv = stringify_argv(argc, argv);
1742     int ret;
1743     if (argc == ++optind) {
1744         char *av = "-";
1745         ret = amplicon_stats(&args, bed_hash, &av, 1);
1746     } else {
1747         ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
1748     }
1749 
1750     free(args.argv);
1751     destroy_bed_hash(bed_hash);
1752 
1753     return ret;
1754 }
1755