1 #include "samtools.pysam.h"
2 
3 /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
4 
5     Copyright (C) 2020-2021 Genome Research Ltd.
6 
7     Author: James Bonfield <jkb@sanger.ac.uk>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE.  */
26 
27 /*
28  * This tool is designed to give "samtools stats" style output, but dedicated
29  * to small amplicon sequencing projects.  It gathers stats on the
30  * distribution of reads across amplicons.
31  */
32 
33 /*
34  * TODO:
35  * - Cope with multiple references.  What do we do here?  Just request one?
36  * - Permit regions rather than consuming whole file (maybe solves above).
37  */
38 
39 #include <config.h>
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <inttypes.h>
44 #include <getopt.h>
45 #include <unistd.h>
46 #include <math.h>
47 
48 #include <htslib/sam.h>
49 #include <htslib/khash.h>
50 
51 #include "samtools.h"
52 #include "sam_opts.h"
53 #include "bam_ampliconclip.h"
54 
55 KHASH_MAP_INIT_INT64(tcoord, int64_t)
56 KHASH_MAP_INIT_STR(qname, int64_t)
57 
58 #ifndef MIN
59 #define MIN(a,b) ((a)<(b)?(a):(b))
60 #endif
61 
62 #ifndef MAX
63 #define MAX(a,b) ((a)>(b)?(a):(b))
64 #endif
65 
66 #ifndef ABS
67 #define ABS(a) ((a)>=0?(a):-(a))
68 #endif
69 
70 #define TCOORD_MIN_COUNT   10
71 #define MAX_AMP 1000       // Default maximum number of amplicons
72 #define MAX_AMP_LEN 1000   // Default maximum length of any single amplicon
73 #define MAX_PRIMER_PER_AMPLICON 4  // Max primers per LEFT/RIGHT
74 #define MAX_DEPTH 5        // Number of different depths permitted
75 
76 typedef struct {
77     sam_global_args ga;
78     uint32_t flag_require;
79     uint32_t flag_filter;
80     int max_delta;   // Used for matching read to amplicon primer loc
81     int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
82     int use_sample_name;
83     int max_amp;     // Total number of amplicons
84     int max_amp_len; // Maximum length of an individual amplicon
85     double depth_bin;// aggregate depth within this fraction
86     int tlen_adj;    // Adjust tlen by this amount, due to clip but no fixmate
87     FILE *out_fp;
88     char *argv;
89     int tcoord_min_count;
90     int tcoord_bin;
91     int multi_ref;
92 } astats_args_t;
93 
94 typedef struct {
95     int nseq;       // total sequence count
96     int nfiltered;  // sequence filtered
97     int nfailprimer;// count of sequences not matching the primer locations
98 
99     // Sizes of memory allocated below, to permit reset
100     int max_amp, max_amp_len, max_len;
101 
102     // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
103     int64_t *nreads, *nreads2;          // [max_amp]
104     double  *nfull_reads;               // [max_amp]; 0.5/read if paired.
105     double  *nrperc, *nrperc2;          // [max_amp]
106     int64_t *nbases, *nbases2;          // [max_amp]
107     int64_t *coverage;                  // [max_amp][max_amp_len]
108     double  (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
109     double  (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
110     khash_t(tcoord) **tcoord;           // [max_amp+1]
111 
112     // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
113     int     (*amp_dist)[3];             // [MAX_AMP][3];
114 
115     int *depth_valid; // [max_len]
116     int *depth_all;   // [max_len]
117     khash_t(qname) *qend;  // queryname end, for overlap removal
118 } astats_t;
119 
120 // We can have multiple primers for LEFT / RIGHT, so this
121 // permits detection by any compatible combination.
122 // One reference:
123 typedef struct {
124     int64_t left[MAX_PRIMER_PER_AMPLICON];
125     int nleft;
126     int64_t right[MAX_PRIMER_PER_AMPLICON];
127     int nright;
128     int64_t max_left, min_right; // inner dimensions
129     int64_t min_left, max_right; // outer dimensions
130 } amplicon_t;
131 
132 // Multiple references, we have an array of amplicons_t - one per used ref.
133 // We have per reference local and global stats here, as some of the stats
134 // are coordinate based.  However we report them combined together as a single
135 // list across all references.
136 // "namp" is the number of amplicons in this reference, but they're
137 // numbered first_amp to first_amp+namp-1 inclusively.
138 typedef struct {
139     int tid, namp;
140     int64_t len;
141     bed_entry_list_t *sites;
142     amplicon_t *amp;
143     astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
144     const char *ref;           // ref name (pointer to the bed hash table key)
145     int first_amp;             // first amplicon number for this ref
146 } amplicons_t;
147 
148 // Reinitialised for each new reference/chromosome.
149 // Counts from 1 to namp, -1 for no match and 0 for ?.
150 static int *pos2start = NULL;
151 static int *pos2end = NULL;
152 static int pos2size = 0; // allocated size of pos2start/end
153 
154 // Lookup table to go from position to amplicon based on
155 // read start / end.
initialise_amp_pos_lookup(astats_args_t * args,amplicons_t * amps,int ref)156 static int initialise_amp_pos_lookup(astats_args_t *args,
157                                      amplicons_t *amps,
158                                      int ref) {
159     int64_t i, j;
160     amplicon_t *amp = amps[ref].amp;
161     int64_t max_len = amps[ref].len;
162     int namp = amps[ref].namp;
163 
164     if (max_len+1 > pos2size) {
165         if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
166             return -1;
167         if (!(pos2end   = realloc(pos2end,   (max_len+1)*sizeof(*pos2end))))
168             return -1;
169         pos2size = max_len;
170     }
171     for (i = 0; i < max_len; i++)
172         pos2start[i] = pos2end[i] = -1;
173 
174     for (i = 0; i < namp; i++) {
175         for (j = 0; j < amp[i].nleft; j++) {
176             int64_t p;
177             for (p = amp[i].left[j] - args->max_delta;
178                  p <= amp[i].left[j] + args->max_delta; p++) {
179                 if (p < 1 || p > max_len)
180                     continue;
181                 pos2start[p-1] = i;
182             }
183         }
184         for (j = 0; j < amp[i].nright; j++) {
185             int64_t p;
186             for (p = amp[i].right[j] - args->max_delta;
187                  p <= amp[i].right[j] + args->max_delta; p++) {
188                 if (p < 1 || p > max_len)
189                     continue;
190                 pos2end[p-1] = i;
191             }
192         }
193     }
194 
195     return 0;
196 }
197 
198 // Counts amplicons.
199 // Assumption: input BED file alternates between LEFT and RIGHT primers
200 // per amplicon, thus we can count the number based on the switching
201 // orientation.
count_amplicon(bed_entry_list_t * sites)202 static int count_amplicon(bed_entry_list_t *sites) {
203     int i, namp, last_rev = 0;
204     for (i = namp = 0; i < sites->length; i++) {
205         if (sites->bp[i].rev == 0 && last_rev)
206             namp++;
207         last_rev = sites->bp[i].rev;
208     }
209 
210     return ++namp;
211 }
212 
213 // We're only interest in the internal part of the amplicon.
214 // Our bed file has LEFT start/end followed by RIGHT start/end,
215 // so collapse these to LEFT end / RIGHT start.
216 //
217 // Returns right most amplicon position on success,
218 //         < 0 on error
bed2amplicon(astats_args_t * args,bed_entry_list_t * sites,amplicon_t * amp,int * namp,int do_title,const char * ref,int first_amp)219 static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
220                             amplicon_t *amp, int *namp, int do_title,
221                             const char *ref, int first_amp) {
222     int i, j;
223     int64_t max_right = 0;
224     FILE *ofp = args->out_fp;
225 
226     *namp = 0;
227 
228     // Assume all primers for the same amplicon are adjacent in BED
229     // with all + followed by all -.  Thus - to + signifies next primer set.
230     int last_rev = 0;
231     amp[0].max_left = 0;
232     amp[0].min_right = INT64_MAX;
233     amp[0].min_left = INT64_MAX;
234     amp[0].max_right = 0;
235     if (do_title) {
236         fprintf(ofp, "# Amplicon locations from BED file.\n");
237         fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
238                 "comma-separated for alt-primers.\n");
239         if (args->multi_ref)
240             fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
241         else
242             fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
243     }
244     for (i = j = 0; i < sites->length; i++) {
245         if (i == 0 && sites->bp[i].rev != 0) {
246             fprintf(samtools_stderr, "[ampliconstats] error: BED file should start"
247                     " with the + strand primer\n");
248             return -1;
249         }
250         if (sites->bp[i].rev == 0 && last_rev) {
251             j++;
252             if (j >= args->max_amp) {
253                 fprintf(samtools_stderr, "[ampliconstats] error: too many amplicons"
254                         " (%d). Use -a option to raise this.\n", j);
255                 return -1;
256             }
257             amp[j].max_left = 0;
258             amp[j].min_right = INT64_MAX;
259             amp[j].min_left = INT64_MAX;
260             amp[j].max_right = 0;
261         }
262         if (sites->bp[i].rev == 0) {
263             if (i == 0 || last_rev) {
264                 if (j>0) fprintf(ofp, "\n");
265                 if (args->multi_ref)
266                     fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
267                 else
268                     fprintf(ofp, "AMPLICON\t%d", j+1);
269             }
270             if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
271                 print_error_errno("ampliconstats",
272                                   "too many primers per amplicon (%d).\n",
273                                   MAX_PRIMER_PER_AMPLICON);
274                 return -1;
275             }
276             amp[j].left[amp[j].nleft++] = sites->bp[i].right;
277             if (amp[j].max_left < sites->bp[i].right+1)
278                 amp[j].max_left = sites->bp[i].right+1;
279             if (amp[j].min_left > sites->bp[i].right+1)
280                 amp[j].min_left = sites->bp[i].right+1;
281             // BED file, so left+1 as zero based. right(+1-1) as
282             // BED goes one beyond end (and we want inclusive range).
283             fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
284                     sites->bp[i].left+1, sites->bp[i].right);
285         } else {
286             if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
287                 print_error_errno("ampliconstats",
288                                   "too many primers per amplicon (%d)",
289                                   MAX_PRIMER_PER_AMPLICON);
290                 return -1;
291             }
292             amp[j].right[amp[j].nright++] = sites->bp[i].left;
293             if (amp[j].min_right > sites->bp[i].left-1)
294                 amp[j].min_right = sites->bp[i].left-1;
295             if (amp[j].max_right < sites->bp[i].left-1) {
296                 amp[j].max_right = sites->bp[i].left-1;
297                 if (amp[j].max_right - amp[j].min_left + 1 >=
298                     args->max_amp_len) {
299                     fprintf(samtools_stderr, "[ampliconstats] error: amplicon "
300                             "longer (%d) than max_amp_len option (%d)\n",
301                             (int)(amp[j].max_right - amp[j].min_left + 2),
302                             args->max_amp_len);
303                     return -1;
304                 }
305                 if (max_right < amp[j].max_right)
306                     max_right = amp[j].max_right;
307             }
308             fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
309                     sites->bp[i].left+1, sites->bp[i].right);
310         }
311         last_rev = sites->bp[i].rev;
312     }
313     if (last_rev != 1) {
314         fprintf(ofp, "\n"); // useful if going to samtools_stdout
315         fprintf(samtools_stderr, "[ampliconstats] error: bed file does not end on"
316                 " a reverse strand primer.\n");
317         return -1;
318     }
319     *namp = ++j;
320     if (j) fprintf(ofp, "\n");
321 
322     if (j >= args->max_amp) {
323         fprintf(samtools_stderr, "[ampliconstats] error: "
324                 "too many amplicons (%d). Use -a option to raise this.", j);
325         return -1;
326     }
327 
328 //    for (i = 0; i < *namp; i++) {
329 //      fprintf(samtools_stdout, "%d\t%ld", i, amp[i].length);
330 //      for (j = 0; j < amp[i].nleft; j++)
331 //          fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].left[j]);
332 //      for (j = 0; j < amp[i].nright; j++)
333 //          fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].right[j]);
334 //      fprintf(samtools_stdout, "\n");
335 //    }
336 
337     return max_right;
338 }
339 
stats_free(astats_t * st)340 void stats_free(astats_t *st) {
341     if (!st)
342         return;
343 
344     free(st->nreads);
345     free(st->nreads2);
346     free(st->nfull_reads);
347     free(st->nrperc);
348     free(st->nrperc2);
349     free(st->nbases);
350     free(st->nbases2);
351     free(st->coverage);
352     free(st->covered_perc);
353     free(st->covered_perc2);
354     free(st->amp_dist);
355 
356     free(st->depth_valid);
357     free(st->depth_all);
358 
359     if (st->tcoord) {
360         int i;
361         for (i = 0; i <= st->max_amp; i++) {
362             if (st->tcoord[i])
363                 kh_destroy(tcoord, st->tcoord[i]);
364         }
365         free(st->tcoord);
366     }
367 
368     khiter_t k;
369     for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
370         if (kh_exist(st->qend, k))
371             free((void *)kh_key(st->qend, k));
372     kh_destroy(qname, st->qend);
373 
374     free(st);
375 }
376 
stats_alloc(int64_t max_len,int max_amp,int max_amp_len)377 astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
378     astats_t *st = calloc(1, sizeof(*st));
379     if (!st)
380         return NULL;
381 
382     st->max_amp = max_amp;
383     st->max_amp_len = max_amp_len;
384     st->max_len = max_len;
385 
386     if (!(st->nreads  = calloc(max_amp, sizeof(*st->nreads))))  goto err;
387     if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
388     if (!(st->nrperc  = calloc(max_amp, sizeof(*st->nrperc))))  goto err;
389     if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
390     if (!(st->nbases  = calloc(max_amp, sizeof(*st->nbases))))  goto err;
391     if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
392 
393     if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
394         goto err;
395 
396     if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage))))
397         goto err;
398 
399     if (!(st->covered_perc  = calloc(max_amp, sizeof(*st->covered_perc))))
400         goto err;
401     if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
402         goto err;
403 
404     if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
405     int i;
406     for (i = 0; i <= st->max_amp; i++)
407         if (!(st->tcoord[i] = kh_init(tcoord)))
408             goto err;
409 
410     if (!(st->qend = kh_init(qname)))
411         goto err;
412 
413     if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
414         goto err;
415     if (!(st->depth_all   = calloc(max_len, sizeof(*st->depth_all))))
416         goto err;
417 
418     if (!(st->amp_dist  = calloc(max_amp, sizeof(*st->amp_dist))))  goto err;
419 
420     return st;
421 
422  err:
423     stats_free(st);
424     return NULL;
425 }
426 
stats_reset(astats_t * st)427 static void stats_reset(astats_t *st) {
428     st->nseq = 0;
429     st->nfiltered = 0;
430     st->nfailprimer = 0;
431 
432     memset(st->nreads,  0, st->max_amp * sizeof(*st->nreads));
433     memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
434     memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
435 
436     memset(st->nrperc,  0, st->max_amp * sizeof(*st->nrperc));
437     memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
438 
439     memset(st->nbases,  0, st->max_amp * sizeof(*st->nbases));
440     memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
441 
442     memset(st->coverage, 0, st->max_amp * st->max_amp_len
443            * sizeof(*st->coverage));
444     memset(st->covered_perc,  0, st->max_amp * sizeof(*st->covered_perc));
445     memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
446 
447     // Keep the allocated entries as it's likely all files will share
448     // the same keys.  Instead we reset counters to zero for common ones
449     // and delete rare ones.
450     int i;
451     for (i = 0; i <= st->max_amp; i++) {
452         khiter_t k;
453         for (k = kh_begin(st->tcoord[i]);
454              k != kh_end(st->tcoord[i]); k++)
455             if (kh_exist(st->tcoord[i], k)) {
456                 if (kh_value(st->tcoord[i], k) < 5)
457                     kh_del(tcoord, st->tcoord[i], k);
458                 else
459                     kh_value(st->tcoord[i], k) = 0;
460             }
461     }
462 
463     khiter_t k;
464     for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
465         if (kh_exist(st->qend, k))
466             free((void *)kh_key(st->qend, k));
467     kh_clear(qname, st->qend);
468 
469     memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
470     memset(st->depth_all,   0, st->max_len * sizeof(*st->depth_all));
471     memset(st->amp_dist,  0, st->max_amp * sizeof(*st->amp_dist));
472 }
473 
amp_stats_reset(amplicons_t * amps,int nref)474 static void amp_stats_reset(amplicons_t *amps, int nref) {
475     int i;
476     for (i = 0; i < nref; i++) {
477         if (!amps[i].sites)
478             continue;
479         stats_reset(amps[i].lstats);
480     }
481 }
482 
accumulate_stats(astats_args_t * args,amplicons_t * amps,bam1_t * b)483 static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
484                             bam1_t *b) {
485     int ref = b->core.tid;
486     amplicon_t *amp = amps[ref].amp;
487     astats_t *stats = amps[ref].lstats;
488     int len = amps[ref].len;
489 
490     if (!stats)
491         return 0;
492 
493     stats->nseq++;
494     if ((b->core.flag & args->flag_require) != args->flag_require ||
495         (b->core.flag & args->flag_filter)  != 0) {
496         stats->nfiltered++;
497         return 0;
498     }
499 
500     int64_t start = b->core.pos, mstart = start; // modified start
501     int64_t end = bam_endpos(b), i;
502 
503     // Compute all-template-depth and valid-template-depth.
504     // We track current end location per read name so we can remove overlaps.
505     // Potentially we could use this data for a better amplicon-depth
506     // count too, but for now it's purely for the per-base plots.
507     int ret;
508     khiter_t k;
509     int prev_start = 0, prev_end = 0;
510     if ((b->core.flag & BAM_FPAIRED)
511         && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
512         k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
513         if (ret == 0) {
514             prev_start = kh_value(stats->qend, k) & 0xffffffff;
515             prev_end = kh_value(stats->qend, k)>>32;
516             mstart = MAX(mstart, prev_end);
517             // Ideally we'd reuse strings so we don't thrash free/malloc.
518             // However let's see if the official way of doing that (malloc
519             // itself) is fast enough first.
520             free((void *)kh_key(stats->qend, k));
521             kh_del(qname, stats->qend, k);
522             //fprintf(samtools_stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
523         } else {
524             if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
525                 return -1;
526 
527             kh_value(stats->qend, k) = start | (end << 32);
528         }
529     }
530     for (i = mstart; i < end && i < len; i++)
531         stats->depth_all[i]++;
532     if (i < end) {
533         print_error("ampliconstats", "record %s overhangs end of reference",
534                     bam_get_qname(b));
535         // But keep going, as it's harmless.
536     }
537 
538     // On single ended runs, eg ONT or PacBio, we just use the start/end
539     // of the template to assign.
540     int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
541         ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
542         : (start >= 0 && start < len ? pos2start[start] : -1);
543 
544     // ivar sometimes soft-clips 100% of the bases.
545     // This is essentially unmapped
546     if (end == start && (args->flag_filter & BAM_FUNMAP)) {
547         stats->nfiltered++;
548         return 0;
549     }
550 
551     if (anum == -1)
552         stats->nfailprimer++;
553 
554     if (anum >= 0) {
555         int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
556         if (c > 0) {
557             stats->nreads[anum]++;
558             // NB: ref bases rather than read bases
559             stats->nbases[anum] += c;
560 
561             int64_t i;
562             if (start < 0) start = 0;
563             if (end > len) end = len;
564 
565             int64_t ostart = MAX(start, amp[anum].min_left-1);
566             int64_t oend = MIN(end, amp[anum].max_right);
567             int64_t offset = amp[anum].min_left-1;
568             for (i = ostart; i < oend; i++)
569                 stats->coverage[anum*stats->max_amp_len + i-offset]++;
570         } else {
571             stats->nfailprimer++;
572         }
573     }
574 
575     // Template length in terms of amplicon number to amplicon number.
576     // We expect left to right of same amplicon (len 0), but it may go
577     // to next amplicon (len 1) or prev (len -1), etc.
578     int64_t t_end;
579     int oth_anum = -1;
580 
581     if (b->core.flag & BAM_FPAIRED) {
582         t_end = (b->core.flag & BAM_FREVERSE ? end : start)
583             + b->core.isize;
584 
585         // If we've clipped the primers but not followed up with a fixmates
586         // then our start+TLEN will take us to a location which is
587         // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
588         //
589         // The correct solution is to run samtools fixmate so TLEN is correct.
590         // The hacky solution is to fudge the expected tlen by double the
591         // average primer length (e.g. 50).
592         t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
593 
594         if (t_end > 0 && t_end < len && b->core.isize != 0)
595             oth_anum = (b->core.flag & BAM_FREVERSE)
596                 ? pos2start[t_end]
597                 : pos2end[t_end];
598     } else {
599         // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
600         oth_anum = pos2start[start];
601         t_end = end;
602     }
603 
604     // We don't want to count our pairs twice.
605     // If both left/right are known, count it on left only.
606     // If only one is known, we'll only get to this code once
607     // so we can also count it.
608     int astatus = 2;
609     if (anum != -1 && oth_anum != -1) {
610         astatus = oth_anum == anum ? 0 : 1;
611         if (start <= t_end)
612             stats->amp_dist[anum][astatus]++;
613     } else if (anum >= 0) {
614         stats->amp_dist[anum][astatus = 2]++;
615     }
616 
617     if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
618         if (prev_end && mstart > prev_end) {
619             // 2nd read with gap to 1st; undo previous increment.
620             for (i = prev_start; i < prev_end; i++)
621                 stats->depth_valid[i]--;
622             stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
623         } else {
624             // 1st read, or 2nd read that overlaps 1st
625             for (i = mstart; i < end; i++)
626                 stats->depth_valid[i]++;
627             stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
628         }
629     }
630 
631     // Track template start,end frequencies, so we can give stats on
632     // amplicon primer usage.
633     if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
634         // left to right only, so we don't double count template positions.
635         return 0;
636 
637     start = b->core.pos;
638     t_end = b->core.flag & BAM_FPAIRED
639         ? start + b->core.isize-1
640         : end;
641     uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
642     k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
643     if (ret < 0)
644         return -1;
645     if (ret == 0)
646         kh_value(stats->tcoord[anum+1], k)++;
647     else
648         kh_value(stats->tcoord[anum+1], k)=1;
649     kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
650 
651     return 0;
652 }
653 
654 // Append file local stats to global stats
append_lstats(astats_t * lstats,astats_t * gstats,int namp,int all_nseq)655 int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
656     gstats->nseq += lstats->nseq;
657     gstats->nfiltered += lstats->nfiltered;
658     gstats->nfailprimer += lstats->nfailprimer;
659 
660     int a;
661     for (a = -1; a < namp; a++) {
662         // Add khash local (kl) to khash global (kg)
663         khiter_t kl, kg;
664         for (kl = kh_begin(lstats->tcoord[a+1]);
665              kl != kh_end(lstats->tcoord[a+1]); kl++) {
666             if (!kh_exist(lstats->tcoord[a+1], kl) ||
667                 kh_value(lstats->tcoord[a+1], kl) == 0)
668                 continue;
669 
670             int ret;
671             kg = kh_put(tcoord, gstats->tcoord[a+1],
672                         kh_key(lstats->tcoord[a+1], kl),
673                         &ret);
674             if (ret < 0)
675                 return -1;
676 
677             kh_value(gstats->tcoord[a+1], kg) =
678                 (ret == 0
679                  ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF)
680                  : 0)
681                 + kh_value(lstats->tcoord[a+1], kl);
682         }
683         if (a == -1) continue;
684 
685         gstats->nreads[a]  += lstats->nreads[a];
686         gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a];
687         gstats->nfull_reads[a] += lstats->nfull_reads[a];
688 
689         // To get mean & sd for amplicon read percentage, we need
690         // to do the divisions here as nseq differs for each sample.
691         double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
692         gstats->nrperc[a]  += nrperc;
693         gstats->nrperc2[a] += nrperc*nrperc;
694 
695         gstats->nbases[a]  += lstats->nbases[a];
696         gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
697 
698         int d;
699         for (d = 0; d < MAX_DEPTH; d++) {
700             gstats->covered_perc[a][d]  += lstats->covered_perc[a][d];
701             gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
702                                          * lstats->covered_perc[a][d];
703         }
704 
705         for (d = 0; d < 3; d++)
706             gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
707     }
708 
709     for (a = 0; a < lstats->max_len; a++) {
710         gstats->depth_valid[a] += lstats->depth_valid[a];
711         gstats->depth_all[a]   += lstats->depth_all[a];
712     }
713 
714     return 0;
715 }
716 
append_stats(amplicons_t * amps,int nref)717 int append_stats(amplicons_t *amps, int nref) {
718     int i, r, all_nseq = 0;
719     for (r = 0; r < nref; r++) {
720         if (!amps[r].sites)
721             continue;
722         astats_t *stats = amps[r].lstats;
723         all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
724     }
725 
726     for (i = 0; i < nref; i++) {
727         if (!amps[i].sites)
728             continue;
729         if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp,
730                           all_nseq) < 0)
731             return -1;
732     }
733 
734     return 0;
735 }
736 
737 typedef struct {
738     int32_t start, end;
739     uint32_t freq;
740     uint32_t status;
741 } tcoord_t;
742 
743 // Sort tcoord by descending frequency and then ascending start and  end.
tcoord_freq_sort(const void * vp1,const void * vp2)744 static int tcoord_freq_sort(const void *vp1, const void *vp2) {
745     const tcoord_t *t1 = (const tcoord_t *)vp1;
746     const tcoord_t *t2 = (const tcoord_t *)vp2;
747 
748     if (t1->freq != t2->freq)
749         return t2->freq - t1->freq;
750 
751     if (t1->start != t2->start)
752         return t1->start - t2->start;
753 
754     return t1->end - t2->end;
755 }
756 
757 
758 /*
759  * Merges tcoord start,end,freq,status tuples if their coordinates are
760  * close together.  We aim to keep the start,end for the most frequent
761  * value and assume that is the correct coordinate and all others are
762  * minor fluctuations due to errors or variants.
763  *
764  * We sort by frequency first and then merge later items in the list into
765  * the earlier more frequent ones.  It's O(N^2), but sufficient for now
766  * given current scale of projects.
767  *
768  * If we ever need to resolve that then consider sorting by start
769  * coordinate and scanning the list to find all items within X, find
770  * the most frequent of those, and then cluster that way.  (I'd have
771  * done that had I thought of it at the time!)
772  */
aggregate_tcoord(astats_args_t * args,tcoord_t * tpos,size_t * np)773 static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
774     size_t n = *np, j, j2, j3, k;
775 
776     // Sort by frequency and cluster infrequent coords into frequent
777     // ones provided they're close by.
778     // This is O(N^2), but we've already binned by tcoord_bin/2 so
779     // the list isn't intended to be vast at this point.
780     qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
781 
782     // For frequency ties, find mid start coord, and then find mid end
783     // coord of those matching start.
784     // We make that the first item so we merge into that mid point.
785     for (j = 0; j < n; j++) {
786         for (j2 = j+1; j2 < n; j2++) {
787             if (tpos[j].freq != tpos[j2].freq)
788                 break;
789             if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
790                 break;
791         }
792 
793         // j to j2 all within bin of a common start,
794         // m is the mid start.
795         if (j2-1 > j) {
796             size_t m = (j2-1 + j)/2;
797 
798             // Find mid end for this same start
799             while (m > 1 && tpos[m].start == tpos[m-1].start)
800                 m--;
801             for (j3 = m+1; j3 < j2; j3++) {
802                 if (tpos[m].start != tpos[j3].start)
803                     break;
804                 if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
805                     break;
806             }
807             if (j3-1 > m)
808                 m = (j3-1 + m)/2;
809 
810             // Swap with first item.
811             tcoord_t tmp = tpos[j];
812             tpos[j] = tpos[m];
813             tpos[m] = tmp;
814             j = j2-1;
815         }
816     }
817 
818     // Now merge in coordinates.
819     // This bit is O(N^2), so consider binning first to reduce the
820     // size of the list if we have excessive positional variation.
821     for (k = j = 0; j < n; j++) {
822         if (!tpos[j].freq)
823             continue;
824 
825         if (k < j)
826             tpos[k] = tpos[j];
827 
828         for (j2 = j+1; j2 < n; j2++) {
829             if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
830                 ABS(tpos[j].end  -tpos[j2].end)  < args->tcoord_bin/2 &&
831                 tpos[j].status == tpos[j2].status) {
832                 tpos[k].freq += tpos[j2].freq;
833                 tpos[j2].freq = 0;
834             }
835         }
836         k++;
837     }
838 
839     *np = k;
840 }
841 
dump_stats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref,int local)842 int dump_stats(astats_args_t *args, char type, char *name, int nfile,
843                amplicons_t *amps, int nref, int local) {
844     int i, r;
845     FILE *ofp = args->out_fp;
846     tcoord_t *tpos = NULL;
847     size_t ntcoord = 0;
848 
849     // summary stats for this sample (or for all samples)
850     fprintf(ofp, "# Summary stats.\n");
851     fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
852 
853     for (r = 0; r < nref; r++) {
854         if (!amps[r].sites)
855             continue;
856         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
857         int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
858         char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
859         if (!name_ref)
860             return -1;
861         if (args->multi_ref)
862             sprintf(name_ref, "%s\t%s", name, amps[r].ref);
863         else
864             sprintf(name_ref, "%s", name);
865         fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
866                 type, name_ref, stats->nseq);
867         fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
868                 type, name_ref, stats->nfiltered);
869         fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
870                 type, name_ref, stats->nfailprimer);
871         fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
872                 type, name_ref, nmatch);
873 
874         int d = 0;
875         do {
876             // From first to last amplicon only, so not entire consensus.
877             // If contig length is known, maybe we want to add the missing
878             // count to < DEPTH figures?
879             int64_t start = 0, covered = 0, total = 0;
880             amplicon_t *amp = amps[r].amp;
881             for (i = 0; i < amps[r].namp; i++) {
882                 int64_t j, offset = amp[i].min_left-1;
883                 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
884                     fprintf(samtools_stderr, "[ampliconstats] error: "
885                             "Maximum amplicon length (%d) exceeded for '%s'\n",
886                             stats->max_amp, name);
887                     return -1;
888                 }
889                 for (j = MAX(start, amp[i].max_left-1);
890                      j < MAX(start, amp[i].min_right); j++) {
891                     if (stats->coverage[i*stats->max_amp_len + j-offset]
892                         >= args->min_depth[d])
893                         covered++;
894                     total++;
895                 }
896                 start = MAX(start, amp[i].min_right);
897             }
898             fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
899                     PRId64"\t%"PRId64"\n", type, name_ref,
900                     args->min_depth[d], args->min_depth[d],
901                     total-covered, covered);
902         } while (++d < MAX_DEPTH && args->min_depth[d]);
903 
904         free(name_ref);
905     }
906 
907     // Read count
908     fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
909     fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
910     fprintf(ofp, "%cREADS\t%s", type, name);
911     for (r = 0; r < nref; r++) {
912         if (!amps[r].sites)
913             continue;
914         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
915         for (i = 0; i < amps[r].namp; i++) {
916             fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
917         }
918     }
919     fprintf(ofp, "\n");
920 
921     // Valid depth is the number of full length reads (already divided
922     // by the number we expect to cover), so +0.5 per read in pair.
923     // A.k.a "usable depth" in the plots.
924     fprintf(ofp, "%cVDEPTH\t%s", type, name);
925     for (r = 0; r < nref; r++) {
926         if (!amps[r].sites)
927             continue;
928         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
929         for (i = 0; i < amps[r].namp; i++)
930             fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
931     }
932     fprintf(ofp, "\n");
933 
934     if (type == 'C') {
935         // For combined we can compute mean & standard deviation too
936         fprintf(ofp, "CREADS\tMEAN");
937         for (r = 0; r < nref; r++) {
938             if (!amps[r].sites)
939                 continue;
940             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
941             for (i = 0; i < amps[r].namp; i++) {
942                 fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
943             }
944         }
945         fprintf(ofp, "\n");
946 
947         fprintf(ofp, "CREADS\tSTDDEV");
948         for (r = 0; r < nref; r++) {
949             if (!amps[r].sites)
950                 continue;
951             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
952             for (i = 0; i < amps[r].namp; i++) {
953                 double n1 = stats->nreads[i];
954                 fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
955                         ? sqrt(stats->nreads2[i]/(double)nfile
956                                - (n1/nfile)*(n1/nfile))
957                         : 0);
958             }
959         }
960         fprintf(ofp, "\n");
961     }
962 
963     fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
964     fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
965     fprintf(ofp, "%cRPERC\t%s", type, name);
966     int all_nseq = 0;
967     for (r = 0; r < nref; r++) {
968         if (!amps[r].sites)
969             continue;
970         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
971         all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
972     }
973     for (r = 0; r < nref; r++) {
974         if (!amps[r].sites)
975             continue;
976         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
977         for (i = 0; i < amps[r].namp; i++) {
978             if (type == 'C') {
979                 fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
980             } else {
981                 fprintf(ofp, "\t%.3f",
982                         all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
983             }
984         }
985     }
986     fprintf(ofp, "\n");
987 
988     if (type == 'C') {
989         // For combined we compute mean and standard deviation too
990         fprintf(ofp, "CRPERC\tMEAN");
991         for (r = 0; r < nref; r++) {
992             if (!amps[r].sites)
993                 continue;
994             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
995             for (i = 0; i < amps[r].namp; i++) {
996                 fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
997             }
998         }
999         fprintf(ofp, "\n");
1000 
1001         fprintf(ofp, "CRPERC\tSTDDEV");
1002         for (r = 0; r < nref; r++) {
1003             if (!amps[r].sites)
1004                 continue;
1005             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1006             for (i = 0; i < amps[r].namp; i++) {
1007                 // variance = SUM(X^2) - ((SUM(X)^2) / N)
1008                 double n1 = stats->nrperc[i];
1009                 double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
1010                 fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
1011             }
1012         }
1013         fprintf(ofp, "\n");
1014     }
1015 
1016     // Base depth
1017     fprintf(ofp, "# Read depth per amplicon.\n");
1018     fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
1019     fprintf(ofp, "%cDEPTH\t%s", type, name);
1020     for (r = 0; r < nref; r++) {
1021         if (!amps[r].sites)
1022             continue;
1023         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1024         amplicon_t *amp = amps[r].amp;
1025         for (i = 0; i < amps[r].namp; i++) {
1026             int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1027             int64_t alen = amp[i].min_right - amp[i].max_left+1;
1028             fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
1029         }
1030     }
1031     fprintf(ofp, "\n");
1032 
1033     if (type == 'C') {
1034         // For combined we can compute mean & standard deviation too
1035         fprintf(ofp, "CDEPTH\tMEAN");
1036         for (r = 0; r < nref; r++) {
1037             if (!amps[r].sites)
1038                 continue;
1039             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1040             amplicon_t *amp = amps[r].amp;
1041             int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1042             for (i = 0; i < amps[r].namp; i++) {
1043                 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1044                 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
1045             }
1046         }
1047         fprintf(ofp, "\n");
1048 
1049         fprintf(ofp, "CDEPTH\tSTDDEV");
1050         for (r = 0; r < nref; r++) {
1051             if (!amps[r].sites)
1052                 continue;
1053             astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1054             amplicon_t *amp = amps[r].amp;
1055             for (i = 0; i < amps[r].namp; i++) {
1056                 double alen = amp[i].min_right - amp[i].max_left+1;
1057                 double n1 = stats->nbases[i] / alen;
1058                 double v = stats->nbases2[i] / (alen*alen) /nfile
1059                     - (n1/nfile)*(n1/nfile);
1060                 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1061             }
1062         }
1063         fprintf(ofp, "\n");
1064     }
1065 
1066     // Percent Coverage
1067     if (type == 'F') {
1068         fprintf(ofp, "# Percentage coverage per amplicon\n");
1069         fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
1070         int d = 0;
1071         do {
1072             fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
1073 
1074             for (r = 0; r < nref; r++) {
1075                 if (!amps[r].sites)
1076                     continue;
1077                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1078                 amplicon_t *amp = amps[r].amp;
1079                 for (i = 0; i < amps[r].namp; i++) {
1080                     int covered = 0;
1081                     if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
1082                         fprintf(samtools_stderr, "[ampliconstats] error: "
1083                                 "Maximum amplicon length (%d) exceeded for '%s'\n",
1084                                 stats->max_amp, name);
1085                         return -1;
1086                     }
1087                     int64_t j, offset = amp[i].min_left-1;
1088                     for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
1089                         int apos = i*stats->max_amp_len + j-offset;
1090                         if (stats->coverage[apos] >= args->min_depth[d])
1091                             covered++;
1092                     }
1093                     int64_t alen = amp[i].min_right - amp[i].max_left+1;
1094                     stats->covered_perc[i][d] = 100.0 * covered / alen;
1095                     fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
1096                 }
1097             }
1098             fprintf(ofp, "\n");
1099         } while (++d < MAX_DEPTH && args->min_depth[d]);
1100 
1101     } else if (type == 'C') {
1102         // For combined we can compute mean & standard deviation too
1103         int d = 0;
1104         do {
1105             fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
1106             for (r = 0; r < nref; r++) {
1107                 if (!amps[r].sites)
1108                     continue;
1109                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1110                 for (i = 0; i < amps[r].namp; i++) {
1111                     fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
1112                 }
1113             }
1114             fprintf(ofp, "\n");
1115 
1116             fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
1117             for (r = 0; r < nref; r++) {
1118                 if (!amps[r].sites)
1119                     continue;
1120                 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1121                 for (i = 0; i < amps[r].namp; i++) {
1122                     double n1 = stats->covered_perc[i][d] / nfile;
1123                     double v = stats->covered_perc2[i][d] / nfile - n1*n1;
1124                     fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1125                 }
1126             }
1127             fprintf(ofp, "\n");
1128         } while (++d < MAX_DEPTH && args->min_depth[d]);
1129     }
1130 
1131     // Plus base depth for all reads, irrespective of amplicon.
1132     // This is post overlap removal, if reads in the read-pair overlap.
1133     fprintf(ofp, "# Depth per reference base for ALL data.\n");
1134     fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
1135             type);
1136     for (r = 0; r < nref; r++) {
1137         if (!amps[r].sites)
1138             continue;
1139         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1140         if (args->multi_ref)
1141             fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
1142         else
1143             fprintf(ofp, "%cDP_ALL\t%s", type, name);
1144 
1145         for (i = 0; i < amps[r].len; i++) {
1146             // Basic run-length encoding provided all values are within
1147             // +- depth_bin fraction of the mid-point.
1148             int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
1149             double dmid = (dmin + dmax)/2.0;
1150             double low  = dmid*(1-args->depth_bin);
1151             double high = dmid*(1+args->depth_bin);
1152             for (j = i+1; j < amps[r].len; j++) {
1153                 int d = stats->depth_all[j];
1154                 if (d < low || d > high)
1155                     break;
1156                 if (dmin > d) {
1157                     dmin = d;
1158                     dmid = (dmin + dmax)/2.0;
1159                     low  = dmid*(1-args->depth_bin);
1160                     high = dmid*(1+args->depth_bin);
1161                 } else if (dmax < d) {
1162                     dmax = d;
1163                     dmid = (dmin + dmax)/2.0;
1164                     low  = dmid*(1-args->depth_bin);
1165                     high = dmid*(1+args->depth_bin);
1166                 }
1167             }
1168             fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1169             i = j-1;
1170         }
1171         fprintf(ofp, "\n");
1172     }
1173 
1174     // And depth for only reads matching to a single amplicon for full
1175     // length.  This is post read overlap removal.
1176     fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
1177     fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
1178             "part.\n", type);
1179     for (r = 0; r < nref; r++) {
1180         if (!amps[r].sites)
1181             continue;
1182         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1183         if (args->multi_ref)
1184             fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
1185         else
1186             fprintf(ofp, "%cDP_VALID\t%s", type, name);
1187 
1188         for (i = 0; i < amps[r].len; i++) {
1189             int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
1190             double dmid = (dmin + dmax)/2.0;
1191             double low  = dmid*(1-args->depth_bin);
1192             double high = dmid*(1+args->depth_bin);
1193             for (j = i+1; j < amps[r].len; j++) {
1194                 int d = stats->depth_valid[j];
1195                 if (d < low || d > high)
1196                     break;
1197                 if (dmin > d) {
1198                     dmin = d;
1199                     dmid = (dmin + dmax)/2.0;
1200                     low  = dmid*(1-args->depth_bin);
1201                     high = dmid*(1+args->depth_bin);
1202                 } else if (dmax < d) {
1203                     dmax = d;
1204                     dmid = (dmin + dmax)/2.0;
1205                     low  = dmid*(1-args->depth_bin);
1206                     high = dmid*(1+args->depth_bin);
1207                 }
1208             }
1209             fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1210             i = j-1;
1211         }
1212         fprintf(ofp, "\n");
1213     }
1214 
1215     // TCOORD (start to end) distribution
1216     fprintf(ofp, "# Distribution of aligned template coordinates.\n");
1217     fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
1218     for (r = 0; r < nref; r++) {
1219         if (!amps[r].sites)
1220             continue;
1221         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1222         for (i = 0 - (nref==1); i < amps[r].namp; i++) {
1223             if (ntcoord < kh_size(stats->tcoord[i+1])) {
1224                 ntcoord = kh_size(stats->tcoord[i+1]);
1225                 tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
1226                 if (!tmp) {
1227                     free(tpos);
1228                     return -1;
1229                 }
1230                 tpos = tmp;
1231             }
1232 
1233             khiter_t k;
1234             size_t n = 0, j;
1235             for (k = kh_begin(stats->tcoord[i+1]);
1236                  k != kh_end(stats->tcoord[i+1]); k++) {
1237                 if (!kh_exist(stats->tcoord[i+1], k) ||
1238                     (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
1239                     continue;
1240                 // Key is start,end in 32-bit quantities.
1241                 // Yes this limits us to 4Gb references, but just how
1242                 // many primers are we planning on making?  Not that many
1243                 // I hope.
1244                 tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
1245                 tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
1246 
1247                 // Value is frequency (top 32-bits) and status (bottom 32).
1248                 tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
1249                 tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
1250                 n++;
1251             }
1252 
1253             if (args->tcoord_bin > 1)
1254                 aggregate_tcoord(args, tpos, &n);
1255 
1256             fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
1257                     i+1+amps[r].first_amp); // per amplicon
1258             for (j = 0; j < n; j++) {
1259                 if (tpos[j].freq < args->tcoord_min_count)
1260                     continue;
1261                 fprintf(ofp, "\t%d,%d,%u,%u",
1262                         tpos[j].start,
1263                         tpos[j].end,
1264                         tpos[j].freq,
1265                         tpos[j].status);
1266             }
1267             fprintf(ofp, "\n");
1268         }
1269     }
1270 
1271 
1272     // AMP length distribution.
1273     // 0 = both ends in this amplicon
1274     // 1 = ends in different amplicons
1275     // 2 = other end matching an unknown amplicon site
1276     //     (see tcoord for further analysis of where)
1277     fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
1278     fprintf(ofp, "# number with both primers from this amplicon, number with\n");
1279     fprintf(ofp, "# primers from different amplicon, and number with a position\n");
1280     fprintf(ofp, "# not matching any valid amplicon primer site\n");
1281     fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
1282 
1283     fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
1284     int amp_dist[3] = {0};
1285     for (r = 0; r < nref; r++) {
1286         if (!amps[r].sites)
1287             continue;
1288         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1289         for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
1290             amp_dist[0] += stats->amp_dist[i][0];
1291             amp_dist[1] += stats->amp_dist[i][1];
1292             amp_dist[2] += stats->amp_dist[i][2];
1293         }
1294     }
1295     fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
1296 
1297     for (r = 0; r < nref; r++) {
1298         if (!amps[r].sites)
1299             continue;
1300         astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1301         for (i = 0; i < amps[r].namp; i++) {
1302             // per amplicon
1303             fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
1304             fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
1305                     stats->amp_dist[i][1], stats->amp_dist[i][2]);
1306         }
1307     }
1308 
1309     free(tpos);
1310     return 0;
1311 }
1312 
dump_lstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1313 int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
1314                amplicons_t *amps, int nref) {
1315     return dump_stats(args, type, name, nfile, amps, nref, 1);
1316 }
1317 
dump_gstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1318 int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
1319                amplicons_t *amps, int nref) {
1320     return dump_stats(args, type, name, nfile, amps, nref, 0);
1321 }
1322 
get_sample_name(sam_hdr_t * header,char * RG)1323 char const *get_sample_name(sam_hdr_t *header, char *RG) {
1324     kstring_t ks = {0};
1325     sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks);
1326     return ks.s;
1327 }
1328 
1329 // Return maximum reference length (SQ is NULL) or the length
1330 // of the specified reference in SQ.
get_ref_len(sam_hdr_t * header,const char * SQ)1331 int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
1332     if (SQ) {
1333         int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0;
1334         return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1;
1335     } else {
1336         int nref = sam_hdr_nref(header), tid;;
1337         int64_t len = 0;
1338         for (tid = 0; tid < nref; tid++) {
1339             int64_t rl = sam_hdr_tid2len(header, tid);
1340             if (len < rl)
1341                 len = rl;
1342         }
1343         return len;
1344     }
1345 }
1346 
amplicon_stats(astats_args_t * args,khash_t (bed_list_hash)* bed_hash,char ** filev,int filec)1347 static int amplicon_stats(astats_args_t *args,
1348                           khash_t(bed_list_hash) *bed_hash,
1349                           char **filev, int filec) {
1350     int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
1351     samFile *fp = NULL;
1352     sam_hdr_t *header = NULL;
1353     bam1_t *b = bam_init1();
1354     FILE *ofp = args->out_fp;
1355     char sname_[8192], *sname = NULL;
1356     amplicons_t *amps = NULL;
1357 
1358     // Report initial SS header.  We gather data from the bed_hash entries
1359     // as well as from the first SAM header (with the requirement that all
1360     // headers should be compatible).
1361     if (filec) {
1362         if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
1363             print_error_errno("ampliconstats",
1364                               "Cannot open input file \"%s\"",
1365                               filev[0]);
1366             goto err;
1367         }
1368         if (!(header = sam_hdr_read(fp)))
1369             goto err;
1370 
1371         if (!amps) {
1372             amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
1373             if (!amps)
1374                 goto err;
1375             fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
1376             fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
1377             fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
1378             fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
1379 
1380             // Note: order of hash entries will be different to order of
1381             // BED file which may also differ to order of SQ headers.
1382             // SQ header is canonical ordering (pos sorted file).
1383             khiter_t k;
1384             int bam_nref = sam_hdr_nref(header);
1385             for (i = 0; i < bam_nref; i++) {
1386                 k = kh_get(bed_list_hash, bed_hash,
1387                            sam_hdr_tid2name(header, i));
1388                 if (!kh_exist(bed_hash, k))
1389                     continue;
1390 
1391                 bed_entry_list_t *sites = &kh_value(bed_hash, k);
1392 
1393                 ref = i;
1394                 amps[ref].ref = kh_key(bed_hash, k);
1395                 amps[ref].sites = sites;
1396                 amps[ref].namp = count_amplicon(sites);
1397                 amps[ref].amp  = calloc(sites->length,
1398                                         sizeof(*amps[ref].amp));
1399                 if (!amps[ref].amp)
1400                     goto err;
1401                 if (args->multi_ref)
1402                     fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
1403                             kh_key(bed_hash, k), amps[ref].namp);
1404                 else
1405                     fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
1406                             amps[ref].namp);
1407 
1408                 amps[ref].tid = ref;
1409                 if (ref_tid == -1)
1410                     ref_tid = ref;
1411 
1412                 int64_t len = get_ref_len(header, kh_key(bed_hash, k));
1413                 amps[ref].len = len;
1414                 if (args->multi_ref)
1415                     fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
1416                             kh_key(bed_hash, k), len);
1417                 else
1418                     fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
1419                             len);
1420 
1421                 amps[ref].lstats = stats_alloc(len, args->max_amp,
1422                                                args->max_amp_len);
1423                 amps[ref].gstats = stats_alloc(len, args->max_amp,
1424                                                args->max_amp_len);
1425                 if (!amps[ref].lstats || !amps[ref].gstats)
1426                     goto err;
1427             }
1428         }
1429 
1430         sam_hdr_destroy(header);
1431         header = NULL;
1432         if (sam_close(fp) < 0) {
1433             fp = NULL;
1434             goto err;
1435         }
1436         fp = NULL;
1437     }
1438     fprintf(ofp, "SS\tEnd of summary\n");
1439 
1440     // Extract the bits of amplicon data we need from bed hash and turn
1441     // it into a position-to-amplicon lookup table.
1442     int offset = 0;
1443     for (i = 0; i < nref; i++) {
1444         if (!amps[i].sites)
1445             continue;
1446 
1447         amps[i].first_amp = offset;
1448         if (bed2amplicon(args, amps[i].sites, amps[i].amp,
1449                          &amps[i].namp, i==0, amps[i].ref, offset) < 0)
1450             goto err;
1451 
1452         offset += amps[i].namp; // cumulative amplicon number across refs
1453     }
1454 
1455     // Now iterate over file contents, one at a time.
1456     for (i = 0; i < filec; i++) {
1457         char *nstart = filev[i];
1458 
1459         fp = sam_open_format(filev[i], "r", &args->ga.in);
1460         if (!fp) {
1461             print_error_errno("ampliconstats",
1462                               "Cannot open input file \"%s\"",
1463                               filev[i]);
1464             goto err;
1465         }
1466 
1467         if (args->ga.nthreads > 0)
1468             hts_set_threads(fp, args->ga.nthreads);
1469 
1470         if (!(header = sam_hdr_read(fp)))
1471             goto err;
1472 
1473         if (nref != sam_hdr_nref(header)) {
1474             print_error_errno("ampliconstats",
1475                               "SAM headers are not consistent across input files");
1476             goto err;
1477         }
1478         int r;
1479         for (r = 0; r < nref; r++) {
1480             if (!amps[r].ref ||
1481                 strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
1482                 amps[r].len != sam_hdr_tid2len(header, r)) {
1483                 print_error_errno("ampliconstats",
1484                                   "SAM headers are not consistent across "
1485                                   "input files");
1486                 goto err;
1487             }
1488         }
1489 
1490         if (args->use_sample_name)
1491             sname = (char *)get_sample_name(header, NULL);
1492 
1493         if (!sname) {
1494             sname = sname_;
1495             char *nend = filev[i] + strlen(filev[i]), *cp;
1496             if ((cp = strrchr(filev[i], '/')))
1497                 nstart = cp+1;
1498             if ((cp = strrchr(nstart, '.')) &&
1499                 (strcmp(cp, ".bam") == 0 ||
1500                  strcmp(cp, ".sam") == 0 ||
1501                  strcmp(cp, ".cram") == 0))
1502                 nend = cp;
1503             if (nend - nstart >= 8192) nend = nstart+8191;
1504             memcpy(sname, nstart, nend-nstart);
1505             sname[nend-nstart] = 0;
1506         }
1507 
1508         // Stats local to this sample only
1509         amp_stats_reset(amps, nref);
1510 
1511         int last_ref = -9;
1512         while ((r = sam_read1(fp, header, b)) >= 0) {
1513             // Other filter options useful here?
1514             if (b->core.tid < 0)
1515                 continue;
1516 
1517             if (last_ref != b->core.tid) {
1518                 last_ref  = b->core.tid;
1519                 if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
1520                     goto err;
1521             }
1522 
1523             if (accumulate_stats(args, amps, b) < 0)
1524                 goto err;
1525         }
1526 
1527         if (r < -1) {
1528             print_error_errno("ampliconstats", "Fail reading record");
1529             goto err;
1530         }
1531 
1532         sam_hdr_destroy(header);
1533         if (sam_close(fp) < 0) {
1534             fp = NULL;
1535             goto err;
1536         }
1537 
1538         fp = NULL;
1539         header = NULL;
1540 
1541         if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
1542             goto err;
1543 
1544         if (append_stats(amps, nref) < 0)
1545             goto err;
1546 
1547         if (sname && sname != sname_)
1548             free(sname);
1549         sname = NULL;
1550     }
1551 
1552     if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
1553         goto err;
1554 
1555     ret = 0;
1556  err:
1557     bam_destroy1(b);
1558     if (ret) {
1559         if (header)
1560             sam_hdr_destroy(header);
1561         if (fp)
1562             sam_close(fp);
1563     }
1564     for (i = 0; i < nref; i++) {
1565         stats_free(amps[i].lstats);
1566         stats_free(amps[i].gstats);
1567         free(amps[i].amp);
1568     }
1569     free(amps);
1570     free(pos2start);
1571     free(pos2end);
1572     if (ret) {
1573         if (sname && sname != sname_)
1574             free(sname);
1575     }
1576 
1577     return ret;
1578 }
1579 
usage(astats_args_t * args,FILE * fp,int exit_status)1580 static int usage(astats_args_t *args, FILE *fp, int exit_status) {
1581     fprintf(fp,
1582 "\n"
1583 "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
1584 "\n"
1585 "Options:\n");
1586     fprintf(fp, "  -f, --required-flag STR|INT\n"
1587             "               Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require);
1588     fprintf(fp, "  -F, --filter-flag STR|INT\n"
1589             "               Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff);
1590     fprintf(fp, "  -a, --max-amplicons INT\n"
1591             "               Change the maximum number of amplicons permitted [%d]\n", MAX_AMP);
1592     fprintf(fp, "  -l, --max-amplicon-length INT\n"
1593             "               Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN);
1594     fprintf(fp, "  -d, --min-depth INT[,INT]...\n"
1595             "               Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]);
1596     fprintf(fp, "  -m, --pos-margin INT\n"
1597             "               Margin of error for matching primer positions [%d]\n", args->max_delta);
1598     fprintf(fp, "  -o, --output FILE\n"
1599             "               Specify output file [samtools_stdout if unset]\n");
1600     fprintf(fp, "  -s, --use-sample-name\n"
1601             "               Use the sample name from the first @RG header line\n");
1602     fprintf(fp, "  -t, --tlen-adjust INT\n"
1603             "               Add/subtract from TLEN; use when clipping but no fixmate step\n");
1604     fprintf(fp, "  -b, --tcoord-bin INT\n"
1605             "               Bin template start,end positions into multiples of INT[1]\n");
1606     fprintf(fp, "  -c, --tcoord-min-count INT\n"
1607             "               Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
1608     fprintf(fp, "  -D, --depth-bin FRACTION\n"
1609             "               Merge FDP values within +/- FRACTION together\n");
1610     fprintf(fp, "  -S, --single-ref\n"
1611             "               Force single-ref (<=1.12) output format\n");
1612     sam_global_opt_help(fp, "I.--.@");
1613 
1614     return exit_status;
1615 }
1616 
main_ampliconstats(int argc,char ** argv)1617 int main_ampliconstats(int argc, char **argv) {
1618     astats_args_t args = {
1619         .ga = SAM_GLOBAL_ARGS_INIT,
1620         .flag_require = 0,
1621         .flag_filter = 0x10B04,
1622         //.sites = BED_LIST_INIT,
1623         .max_delta = 30, // large enough to cope with alt primers
1624         .min_depth = {1},
1625         .use_sample_name = 0,
1626         .max_amp = MAX_AMP,
1627         .max_amp_len = MAX_AMP_LEN,
1628         .tlen_adj = 0,
1629         .out_fp = samtools_stdout,
1630         .tcoord_min_count = TCOORD_MIN_COUNT,
1631         .tcoord_bin = 1,
1632         .depth_bin = 0.01,
1633         .multi_ref = 1
1634     }, oargs = args;
1635 
1636     static const struct option loptions[] =
1637     {
1638         SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
1639         {"help", no_argument, NULL, 'h'},
1640         {"flag-require", required_argument, NULL, 'f'},
1641         {"flag-filter", required_argument, NULL, 'F'},
1642         {"min-depth", required_argument, NULL, 'd'},
1643         {"output", required_argument, NULL, 'o'},
1644         {"pos-margin", required_argument, NULL, 'm'},
1645         {"use-sample-name", no_argument, NULL, 's'},
1646         {"max-amplicons", required_argument, NULL, 'a'},
1647         {"max-amplicon-length", required_argument, NULL, 'l'},
1648         {"tlen-adjust", required_argument, NULL, 't'},
1649         {"tcoord-min-count", required_argument, NULL, 'c'},
1650         {"tcoord-bin", required_argument, NULL, 'b'},
1651         {"depth-bin", required_argument, NULL, 'D'},
1652         {"single-ref", no_argument, NULL, 'S'},
1653         {NULL, 0, NULL, 0}
1654     };
1655     int opt;
1656 
1657     while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
1658         switch (opt) {
1659         case 'f': args.flag_require = bam_str2flag(optarg); break;
1660         case 'F':
1661             if (args.flag_filter & 0x10000)
1662                 args.flag_filter = 0; // strip default on first -F usage
1663             args.flag_filter |= bam_str2flag(optarg); break;
1664 
1665         case 'm': args.max_delta = atoi(optarg); break; // margin
1666         case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
1667         case 'd': {
1668             int d = 0;
1669             char *cp = optarg, *ep;
1670             do {
1671                 long n = strtol(cp, &ep, 10);
1672                 args.min_depth[d++] = n;
1673                 if (*ep != ',')
1674                     break;
1675                 cp = ep+1;
1676             } while (d < MAX_DEPTH);
1677             break;
1678         }
1679 
1680         case 'a': args.max_amp = atoi(optarg)+1;break;
1681         case 'l': args.max_amp_len = atoi(optarg)+1;break;
1682 
1683         case 'c': args.tcoord_min_count = atoi(optarg);break;
1684         case 'b':
1685             args.tcoord_bin = atoi(optarg);
1686             if (args.tcoord_bin < 1)
1687                 args.tcoord_bin = 1;
1688             break;
1689 
1690         case 't': args.tlen_adj = atoi(optarg);break;
1691 
1692         case 's': args.use_sample_name = 1;break;
1693 
1694         case 'o':
1695             if (!(args.out_fp = fopen(optarg, "w"))) {
1696                 perror(optarg);
1697                 return 1;
1698             }
1699             break;
1700 
1701         case 'S':
1702             args.multi_ref = 0;
1703             break;
1704 
1705         case '?': return usage(&oargs, samtools_stderr, EXIT_FAILURE);
1706         case 'h': return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
1707 
1708         default:
1709             if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
1710                 usage(&oargs,samtools_stderr, EXIT_FAILURE);
1711             break;
1712         }
1713     }
1714 
1715     if (argc <= optind)
1716         return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
1717     if (argc <= optind+1 && isatty(STDIN_FILENO))
1718         return usage(&oargs, samtools_stderr, EXIT_FAILURE);
1719 
1720     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
1721     if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
1722         print_error_errno("ampliconstats",
1723                           "Could not read file \"%s\"", argv[optind]);
1724         return 1;
1725 
1726     }
1727 
1728     khiter_t k, ref_count = 0;
1729     for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
1730         if (!kh_exist(bed_hash, k))
1731             continue;
1732         ref_count++;
1733     }
1734     if (ref_count == 0)
1735         return 1;
1736     if (ref_count > 1 && args.multi_ref == 0) {
1737         print_error("ampliconstats",
1738                     "Single-ref mode is not permitted for BED files\n"
1739                     "containing more than one reference.");
1740         return 1;
1741     }
1742 
1743     args.argv = stringify_argv(argc, argv);
1744     int ret;
1745     if (argc == ++optind) {
1746         char *av = "-";
1747         ret = amplicon_stats(&args, bed_hash, &av, 1);
1748     } else {
1749         ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
1750     }
1751 
1752     free(args.argv);
1753     destroy_bed_hash(bed_hash);
1754 
1755     return ret;
1756 }
1757