1 /* stats.c -- This is the former bamcheck integrated into samtools/htslib.
2
3 Copyright (C) 2020-2021 Genome Research Ltd.
4
5 Author: James Bonfield <jkb@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 /*
26 * This tool is designed to give "samtools stats" style output, but dedicated
27 * to small amplicon sequencing projects. It gathers stats on the
28 * distribution of reads across amplicons.
29 */
30
31 /*
32 * TODO:
33 * - Cope with multiple references. What do we do here? Just request one?
34 * - Permit regions rather than consuming whole file (maybe solves above).
35 */
36
37 #include <config.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <inttypes.h>
42 #include <getopt.h>
43 #include <unistd.h>
44 #include <math.h>
45
46 #include <htslib/sam.h>
47 #include <htslib/khash.h>
48
49 #include "samtools.h"
50 #include "sam_opts.h"
51 #include "bam_ampliconclip.h"
52
53 KHASH_MAP_INIT_INT64(tcoord, int64_t)
54 KHASH_MAP_INIT_STR(qname, int64_t)
55
56 #ifndef MIN
57 #define MIN(a,b) ((a)<(b)?(a):(b))
58 #endif
59
60 #ifndef MAX
61 #define MAX(a,b) ((a)>(b)?(a):(b))
62 #endif
63
64 #ifndef ABS
65 #define ABS(a) ((a)>=0?(a):-(a))
66 #endif
67
68 #define TCOORD_MIN_COUNT 10
69 #define MAX_AMP 1000 // Default maximum number of amplicons
70 #define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon
71 #define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT
72 #define MAX_DEPTH 5 // Number of different depths permitted
73
74 typedef struct {
75 sam_global_args ga;
76 uint32_t flag_require;
77 uint32_t flag_filter;
78 int max_delta; // Used for matching read to amplicon primer loc
79 int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
80 int use_sample_name;
81 int max_amp; // Total number of amplicons
82 int max_amp_len; // Maximum length of an individual amplicon
83 double depth_bin;// aggregate depth within this fraction
84 int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate
85 FILE *out_fp;
86 char *argv;
87 int tcoord_min_count;
88 int tcoord_bin;
89 int multi_ref;
90 } astats_args_t;
91
92 typedef struct {
93 int nseq; // total sequence count
94 int nfiltered; // sequence filtered
95 int nfailprimer;// count of sequences not matching the primer locations
96
97 // Sizes of memory allocated below, to permit reset
98 int max_amp, max_amp_len, max_len;
99
100 // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
101 int64_t *nreads, *nreads2; // [max_amp]
102 double *nfull_reads; // [max_amp]; 0.5/read if paired.
103 double *nrperc, *nrperc2; // [max_amp]
104 int64_t *nbases, *nbases2; // [max_amp]
105 int64_t *coverage; // [max_amp][max_amp_len]
106 double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
107 double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
108 khash_t(tcoord) **tcoord; // [max_amp+1]
109
110 // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
111 int (*amp_dist)[3]; // [MAX_AMP][3];
112
113 int *depth_valid; // [max_len]
114 int *depth_all; // [max_len]
115 khash_t(qname) *qend; // queryname end, for overlap removal
116 } astats_t;
117
118 // We can have multiple primers for LEFT / RIGHT, so this
119 // permits detection by any compatible combination.
120 // One reference:
121 typedef struct {
122 int64_t left[MAX_PRIMER_PER_AMPLICON];
123 int nleft;
124 int64_t right[MAX_PRIMER_PER_AMPLICON];
125 int nright;
126 int64_t max_left, min_right; // inner dimensions
127 int64_t min_left, max_right; // outer dimensions
128 } amplicon_t;
129
130 // Multiple references, we have an array of amplicons_t - one per used ref.
131 // We have per reference local and global stats here, as some of the stats
132 // are coordinate based. However we report them combined together as a single
133 // list across all references.
134 // "namp" is the number of amplicons in this reference, but they're
135 // numbered first_amp to first_amp+namp-1 inclusively.
136 typedef struct {
137 int tid, namp;
138 int64_t len;
139 bed_entry_list_t *sites;
140 amplicon_t *amp;
141 astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
142 const char *ref; // ref name (pointer to the bed hash table key)
143 int first_amp; // first amplicon number for this ref
144 } amplicons_t;
145
146 // Reinitialised for each new reference/chromosome.
147 // Counts from 1 to namp, -1 for no match and 0 for ?.
148 static int *pos2start = NULL;
149 static int *pos2end = NULL;
150 static int pos2size = 0; // allocated size of pos2start/end
151
152 // Lookup table to go from position to amplicon based on
153 // read start / end.
initialise_amp_pos_lookup(astats_args_t * args,amplicons_t * amps,int ref)154 static int initialise_amp_pos_lookup(astats_args_t *args,
155 amplicons_t *amps,
156 int ref) {
157 int64_t i, j;
158 amplicon_t *amp = amps[ref].amp;
159 int64_t max_len = amps[ref].len;
160 int namp = amps[ref].namp;
161
162 if (max_len+1 > pos2size) {
163 if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
164 return -1;
165 if (!(pos2end = realloc(pos2end, (max_len+1)*sizeof(*pos2end))))
166 return -1;
167 pos2size = max_len;
168 }
169 for (i = 0; i < max_len; i++)
170 pos2start[i] = pos2end[i] = -1;
171
172 for (i = 0; i < namp; i++) {
173 for (j = 0; j < amp[i].nleft; j++) {
174 int64_t p;
175 for (p = amp[i].left[j] - args->max_delta;
176 p <= amp[i].left[j] + args->max_delta; p++) {
177 if (p < 1 || p > max_len)
178 continue;
179 pos2start[p-1] = i;
180 }
181 }
182 for (j = 0; j < amp[i].nright; j++) {
183 int64_t p;
184 for (p = amp[i].right[j] - args->max_delta;
185 p <= amp[i].right[j] + args->max_delta; p++) {
186 if (p < 1 || p > max_len)
187 continue;
188 pos2end[p-1] = i;
189 }
190 }
191 }
192
193 return 0;
194 }
195
196 // Counts amplicons.
197 // Assumption: input BED file alternates between LEFT and RIGHT primers
198 // per amplicon, thus we can count the number based on the switching
199 // orientation.
count_amplicon(bed_entry_list_t * sites)200 static int count_amplicon(bed_entry_list_t *sites) {
201 int i, namp, last_rev = 0;
202 for (i = namp = 0; i < sites->length; i++) {
203 if (sites->bp[i].rev == 0 && last_rev)
204 namp++;
205 last_rev = sites->bp[i].rev;
206 }
207
208 return ++namp;
209 }
210
211 // We're only interest in the internal part of the amplicon.
212 // Our bed file has LEFT start/end followed by RIGHT start/end,
213 // so collapse these to LEFT end / RIGHT start.
214 //
215 // Returns right most amplicon position on success,
216 // < 0 on error
bed2amplicon(astats_args_t * args,bed_entry_list_t * sites,amplicon_t * amp,int * namp,int do_title,const char * ref,int first_amp)217 static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
218 amplicon_t *amp, int *namp, int do_title,
219 const char *ref, int first_amp) {
220 int i, j;
221 int64_t max_right = 0;
222 FILE *ofp = args->out_fp;
223
224 *namp = 0;
225
226 // Assume all primers for the same amplicon are adjacent in BED
227 // with all + followed by all -. Thus - to + signifies next primer set.
228 int last_rev = 0;
229 amp[0].max_left = 0;
230 amp[0].min_right = INT64_MAX;
231 amp[0].min_left = INT64_MAX;
232 amp[0].max_right = 0;
233 if (do_title) {
234 fprintf(ofp, "# Amplicon locations from BED file.\n");
235 fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
236 "comma-separated for alt-primers.\n");
237 if (args->multi_ref)
238 fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
239 else
240 fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
241 }
242 for (i = j = 0; i < sites->length; i++) {
243 if (i == 0 && sites->bp[i].rev != 0) {
244 fprintf(stderr, "[ampliconstats] error: BED file should start"
245 " with the + strand primer\n");
246 return -1;
247 }
248 if (sites->bp[i].rev == 0 && last_rev) {
249 j++;
250 if (j >= args->max_amp) {
251 fprintf(stderr, "[ampliconstats] error: too many amplicons"
252 " (%d). Use -a option to raise this.\n", j);
253 return -1;
254 }
255 amp[j].max_left = 0;
256 amp[j].min_right = INT64_MAX;
257 amp[j].min_left = INT64_MAX;
258 amp[j].max_right = 0;
259 }
260 if (sites->bp[i].rev == 0) {
261 if (i == 0 || last_rev) {
262 if (j>0) fprintf(ofp, "\n");
263 if (args->multi_ref)
264 fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
265 else
266 fprintf(ofp, "AMPLICON\t%d", j+1);
267 }
268 if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
269 print_error_errno("ampliconstats",
270 "too many primers per amplicon (%d).\n",
271 MAX_PRIMER_PER_AMPLICON);
272 return -1;
273 }
274 amp[j].left[amp[j].nleft++] = sites->bp[i].right;
275 if (amp[j].max_left < sites->bp[i].right+1)
276 amp[j].max_left = sites->bp[i].right+1;
277 if (amp[j].min_left > sites->bp[i].right+1)
278 amp[j].min_left = sites->bp[i].right+1;
279 // BED file, so left+1 as zero based. right(+1-1) as
280 // BED goes one beyond end (and we want inclusive range).
281 fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
282 sites->bp[i].left+1, sites->bp[i].right);
283 } else {
284 if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
285 print_error_errno("ampliconstats",
286 "too many primers per amplicon (%d)",
287 MAX_PRIMER_PER_AMPLICON);
288 return -1;
289 }
290 amp[j].right[amp[j].nright++] = sites->bp[i].left;
291 if (amp[j].min_right > sites->bp[i].left-1)
292 amp[j].min_right = sites->bp[i].left-1;
293 if (amp[j].max_right < sites->bp[i].left-1) {
294 amp[j].max_right = sites->bp[i].left-1;
295 if (amp[j].max_right - amp[j].min_left + 1 >=
296 args->max_amp_len) {
297 fprintf(stderr, "[ampliconstats] error: amplicon "
298 "longer (%d) than max_amp_len option (%d)\n",
299 (int)(amp[j].max_right - amp[j].min_left + 2),
300 args->max_amp_len);
301 return -1;
302 }
303 if (max_right < amp[j].max_right)
304 max_right = amp[j].max_right;
305 }
306 fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
307 sites->bp[i].left+1, sites->bp[i].right);
308 }
309 last_rev = sites->bp[i].rev;
310 }
311 if (last_rev != 1) {
312 fprintf(ofp, "\n"); // useful if going to stdout
313 fprintf(stderr, "[ampliconstats] error: bed file does not end on"
314 " a reverse strand primer.\n");
315 return -1;
316 }
317 *namp = ++j;
318 if (j) fprintf(ofp, "\n");
319
320 if (j >= args->max_amp) {
321 fprintf(stderr, "[ampliconstats] error: "
322 "too many amplicons (%d). Use -a option to raise this.", j);
323 return -1;
324 }
325
326 // for (i = 0; i < *namp; i++) {
327 // printf("%d\t%ld", i, amp[i].length);
328 // for (j = 0; j < amp[i].nleft; j++)
329 // printf("%c%ld", "\t,"[j>0], amp[i].left[j]);
330 // for (j = 0; j < amp[i].nright; j++)
331 // printf("%c%ld", "\t,"[j>0], amp[i].right[j]);
332 // printf("\n");
333 // }
334
335 return max_right;
336 }
337
stats_free(astats_t * st)338 void stats_free(astats_t *st) {
339 if (!st)
340 return;
341
342 free(st->nreads);
343 free(st->nreads2);
344 free(st->nfull_reads);
345 free(st->nrperc);
346 free(st->nrperc2);
347 free(st->nbases);
348 free(st->nbases2);
349 free(st->coverage);
350 free(st->covered_perc);
351 free(st->covered_perc2);
352 free(st->amp_dist);
353
354 free(st->depth_valid);
355 free(st->depth_all);
356
357 if (st->tcoord) {
358 int i;
359 for (i = 0; i <= st->max_amp; i++) {
360 if (st->tcoord[i])
361 kh_destroy(tcoord, st->tcoord[i]);
362 }
363 free(st->tcoord);
364 }
365
366 khiter_t k;
367 for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
368 if (kh_exist(st->qend, k))
369 free((void *)kh_key(st->qend, k));
370 kh_destroy(qname, st->qend);
371
372 free(st);
373 }
374
stats_alloc(int64_t max_len,int max_amp,int max_amp_len)375 astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
376 astats_t *st = calloc(1, sizeof(*st));
377 if (!st)
378 return NULL;
379
380 st->max_amp = max_amp;
381 st->max_amp_len = max_amp_len;
382 st->max_len = max_len;
383
384 if (!(st->nreads = calloc(max_amp, sizeof(*st->nreads)))) goto err;
385 if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
386 if (!(st->nrperc = calloc(max_amp, sizeof(*st->nrperc)))) goto err;
387 if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
388 if (!(st->nbases = calloc(max_amp, sizeof(*st->nbases)))) goto err;
389 if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
390
391 if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
392 goto err;
393
394 if (!(st->coverage = calloc(max_amp*max_amp_len, sizeof(*st->coverage))))
395 goto err;
396
397 if (!(st->covered_perc = calloc(max_amp, sizeof(*st->covered_perc))))
398 goto err;
399 if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
400 goto err;
401
402 if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
403 int i;
404 for (i = 0; i <= st->max_amp; i++)
405 if (!(st->tcoord[i] = kh_init(tcoord)))
406 goto err;
407
408 if (!(st->qend = kh_init(qname)))
409 goto err;
410
411 if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
412 goto err;
413 if (!(st->depth_all = calloc(max_len, sizeof(*st->depth_all))))
414 goto err;
415
416 if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist)))) goto err;
417
418 return st;
419
420 err:
421 stats_free(st);
422 return NULL;
423 }
424
stats_reset(astats_t * st)425 static void stats_reset(astats_t *st) {
426 st->nseq = 0;
427 st->nfiltered = 0;
428 st->nfailprimer = 0;
429
430 memset(st->nreads, 0, st->max_amp * sizeof(*st->nreads));
431 memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
432 memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
433
434 memset(st->nrperc, 0, st->max_amp * sizeof(*st->nrperc));
435 memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
436
437 memset(st->nbases, 0, st->max_amp * sizeof(*st->nbases));
438 memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
439
440 memset(st->coverage, 0, st->max_amp * st->max_amp_len
441 * sizeof(*st->coverage));
442 memset(st->covered_perc, 0, st->max_amp * sizeof(*st->covered_perc));
443 memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
444
445 // Keep the allocated entries as it's likely all files will share
446 // the same keys. Instead we reset counters to zero for common ones
447 // and delete rare ones.
448 int i;
449 for (i = 0; i <= st->max_amp; i++) {
450 khiter_t k;
451 for (k = kh_begin(st->tcoord[i]);
452 k != kh_end(st->tcoord[i]); k++)
453 if (kh_exist(st->tcoord[i], k)) {
454 if (kh_value(st->tcoord[i], k) < 5)
455 kh_del(tcoord, st->tcoord[i], k);
456 else
457 kh_value(st->tcoord[i], k) = 0;
458 }
459 }
460
461 khiter_t k;
462 for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
463 if (kh_exist(st->qend, k))
464 free((void *)kh_key(st->qend, k));
465 kh_clear(qname, st->qend);
466
467 memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
468 memset(st->depth_all, 0, st->max_len * sizeof(*st->depth_all));
469 memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist));
470 }
471
amp_stats_reset(amplicons_t * amps,int nref)472 static void amp_stats_reset(amplicons_t *amps, int nref) {
473 int i;
474 for (i = 0; i < nref; i++) {
475 if (!amps[i].sites)
476 continue;
477 stats_reset(amps[i].lstats);
478 }
479 }
480
accumulate_stats(astats_args_t * args,amplicons_t * amps,bam1_t * b)481 static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
482 bam1_t *b) {
483 int ref = b->core.tid;
484 amplicon_t *amp = amps[ref].amp;
485 astats_t *stats = amps[ref].lstats;
486 int len = amps[ref].len;
487
488 if (!stats)
489 return 0;
490
491 stats->nseq++;
492 if ((b->core.flag & args->flag_require) != args->flag_require ||
493 (b->core.flag & args->flag_filter) != 0) {
494 stats->nfiltered++;
495 return 0;
496 }
497
498 int64_t start = b->core.pos, mstart = start; // modified start
499 int64_t end = bam_endpos(b), i;
500
501 // Compute all-template-depth and valid-template-depth.
502 // We track current end location per read name so we can remove overlaps.
503 // Potentially we could use this data for a better amplicon-depth
504 // count too, but for now it's purely for the per-base plots.
505 int ret;
506 khiter_t k;
507 int prev_start = 0, prev_end = 0;
508 if ((b->core.flag & BAM_FPAIRED)
509 && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
510 k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
511 if (ret == 0) {
512 prev_start = kh_value(stats->qend, k) & 0xffffffff;
513 prev_end = kh_value(stats->qend, k)>>32;
514 mstart = MAX(mstart, prev_end);
515 // Ideally we'd reuse strings so we don't thrash free/malloc.
516 // However let's see if the official way of doing that (malloc
517 // itself) is fast enough first.
518 free((void *)kh_key(stats->qend, k));
519 kh_del(qname, stats->qend, k);
520 //fprintf(stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
521 } else {
522 if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
523 return -1;
524
525 kh_value(stats->qend, k) = start | (end << 32);
526 }
527 }
528 for (i = mstart; i < end && i < len; i++)
529 stats->depth_all[i]++;
530 if (i < end) {
531 print_error("ampliconstats", "record %s overhangs end of reference",
532 bam_get_qname(b));
533 // But keep going, as it's harmless.
534 }
535
536 // On single ended runs, eg ONT or PacBio, we just use the start/end
537 // of the template to assign.
538 int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
539 ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
540 : (start >= 0 && start < len ? pos2start[start] : -1);
541
542 // ivar sometimes soft-clips 100% of the bases.
543 // This is essentially unmapped
544 if (end == start && (args->flag_filter & BAM_FUNMAP)) {
545 stats->nfiltered++;
546 return 0;
547 }
548
549 if (anum == -1)
550 stats->nfailprimer++;
551
552 if (anum >= 0) {
553 int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
554 if (c > 0) {
555 stats->nreads[anum]++;
556 // NB: ref bases rather than read bases
557 stats->nbases[anum] += c;
558
559 int64_t i;
560 if (start < 0) start = 0;
561 if (end > len) end = len;
562
563 int64_t ostart = MAX(start, amp[anum].min_left-1);
564 int64_t oend = MIN(end, amp[anum].max_right);
565 int64_t offset = amp[anum].min_left-1;
566 for (i = ostart; i < oend; i++)
567 stats->coverage[anum*stats->max_amp_len + i-offset]++;
568 } else {
569 stats->nfailprimer++;
570 }
571 }
572
573 // Template length in terms of amplicon number to amplicon number.
574 // We expect left to right of same amplicon (len 0), but it may go
575 // to next amplicon (len 1) or prev (len -1), etc.
576 int64_t t_end;
577 int oth_anum = -1;
578
579 if (b->core.flag & BAM_FPAIRED) {
580 t_end = (b->core.flag & BAM_FREVERSE ? end : start)
581 + b->core.isize;
582
583 // If we've clipped the primers but not followed up with a fixmates
584 // then our start+TLEN will take us to a location which is
585 // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
586 //
587 // The correct solution is to run samtools fixmate so TLEN is correct.
588 // The hacky solution is to fudge the expected tlen by double the
589 // average primer length (e.g. 50).
590 t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
591
592 if (t_end > 0 && t_end < len && b->core.isize != 0)
593 oth_anum = (b->core.flag & BAM_FREVERSE)
594 ? pos2start[t_end]
595 : pos2end[t_end];
596 } else {
597 // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
598 oth_anum = pos2start[start];
599 t_end = end;
600 }
601
602 // We don't want to count our pairs twice.
603 // If both left/right are known, count it on left only.
604 // If only one is known, we'll only get to this code once
605 // so we can also count it.
606 int astatus = 2;
607 if (anum != -1 && oth_anum != -1) {
608 astatus = oth_anum == anum ? 0 : 1;
609 if (start <= t_end)
610 stats->amp_dist[anum][astatus]++;
611 } else if (anum >= 0) {
612 stats->amp_dist[anum][astatus = 2]++;
613 }
614
615 if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
616 if (prev_end && mstart > prev_end) {
617 // 2nd read with gap to 1st; undo previous increment.
618 for (i = prev_start; i < prev_end; i++)
619 stats->depth_valid[i]--;
620 stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
621 } else {
622 // 1st read, or 2nd read that overlaps 1st
623 for (i = mstart; i < end; i++)
624 stats->depth_valid[i]++;
625 stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
626 }
627 }
628
629 // Track template start,end frequencies, so we can give stats on
630 // amplicon primer usage.
631 if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
632 // left to right only, so we don't double count template positions.
633 return 0;
634
635 start = b->core.pos;
636 t_end = b->core.flag & BAM_FPAIRED
637 ? start + b->core.isize-1
638 : end;
639 uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
640 k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
641 if (ret < 0)
642 return -1;
643 if (ret == 0)
644 kh_value(stats->tcoord[anum+1], k)++;
645 else
646 kh_value(stats->tcoord[anum+1], k)=1;
647 kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
648
649 return 0;
650 }
651
652 // Append file local stats to global stats
append_lstats(astats_t * lstats,astats_t * gstats,int namp,int all_nseq)653 int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
654 gstats->nseq += lstats->nseq;
655 gstats->nfiltered += lstats->nfiltered;
656 gstats->nfailprimer += lstats->nfailprimer;
657
658 int a;
659 for (a = -1; a < namp; a++) {
660 // Add khash local (kl) to khash global (kg)
661 khiter_t kl, kg;
662 for (kl = kh_begin(lstats->tcoord[a+1]);
663 kl != kh_end(lstats->tcoord[a+1]); kl++) {
664 if (!kh_exist(lstats->tcoord[a+1], kl) ||
665 kh_value(lstats->tcoord[a+1], kl) == 0)
666 continue;
667
668 int ret;
669 kg = kh_put(tcoord, gstats->tcoord[a+1],
670 kh_key(lstats->tcoord[a+1], kl),
671 &ret);
672 if (ret < 0)
673 return -1;
674
675 kh_value(gstats->tcoord[a+1], kg) =
676 (ret == 0
677 ? (kh_value(gstats->tcoord[a+1], kg) & 0xFFFFFFFF)
678 : 0)
679 + kh_value(lstats->tcoord[a+1], kl);
680 }
681 if (a == -1) continue;
682
683 gstats->nreads[a] += lstats->nreads[a];
684 gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a];
685 gstats->nfull_reads[a] += lstats->nfull_reads[a];
686
687 // To get mean & sd for amplicon read percentage, we need
688 // to do the divisions here as nseq differs for each sample.
689 double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
690 gstats->nrperc[a] += nrperc;
691 gstats->nrperc2[a] += nrperc*nrperc;
692
693 gstats->nbases[a] += lstats->nbases[a];
694 gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
695
696 int d;
697 for (d = 0; d < MAX_DEPTH; d++) {
698 gstats->covered_perc[a][d] += lstats->covered_perc[a][d];
699 gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
700 * lstats->covered_perc[a][d];
701 }
702
703 for (d = 0; d < 3; d++)
704 gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
705 }
706
707 for (a = 0; a < lstats->max_len; a++) {
708 gstats->depth_valid[a] += lstats->depth_valid[a];
709 gstats->depth_all[a] += lstats->depth_all[a];
710 }
711
712 return 0;
713 }
714
append_stats(amplicons_t * amps,int nref)715 int append_stats(amplicons_t *amps, int nref) {
716 int i, r, all_nseq = 0;
717 for (r = 0; r < nref; r++) {
718 if (!amps[r].sites)
719 continue;
720 astats_t *stats = amps[r].lstats;
721 all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
722 }
723
724 for (i = 0; i < nref; i++) {
725 if (!amps[i].sites)
726 continue;
727 if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp,
728 all_nseq) < 0)
729 return -1;
730 }
731
732 return 0;
733 }
734
735 typedef struct {
736 int32_t start, end;
737 uint32_t freq;
738 uint32_t status;
739 } tcoord_t;
740
741 // Sort tcoord by descending frequency and then ascending start and end.
tcoord_freq_sort(const void * vp1,const void * vp2)742 static int tcoord_freq_sort(const void *vp1, const void *vp2) {
743 const tcoord_t *t1 = (const tcoord_t *)vp1;
744 const tcoord_t *t2 = (const tcoord_t *)vp2;
745
746 if (t1->freq != t2->freq)
747 return t2->freq - t1->freq;
748
749 if (t1->start != t2->start)
750 return t1->start - t2->start;
751
752 return t1->end - t2->end;
753 }
754
755
756 /*
757 * Merges tcoord start,end,freq,status tuples if their coordinates are
758 * close together. We aim to keep the start,end for the most frequent
759 * value and assume that is the correct coordinate and all others are
760 * minor fluctuations due to errors or variants.
761 *
762 * We sort by frequency first and then merge later items in the list into
763 * the earlier more frequent ones. It's O(N^2), but sufficient for now
764 * given current scale of projects.
765 *
766 * If we ever need to resolve that then consider sorting by start
767 * coordinate and scanning the list to find all items within X, find
768 * the most frequent of those, and then cluster that way. (I'd have
769 * done that had I thought of it at the time!)
770 */
aggregate_tcoord(astats_args_t * args,tcoord_t * tpos,size_t * np)771 static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
772 size_t n = *np, j, j2, j3, k;
773
774 // Sort by frequency and cluster infrequent coords into frequent
775 // ones provided they're close by.
776 // This is O(N^2), but we've already binned by tcoord_bin/2 so
777 // the list isn't intended to be vast at this point.
778 qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
779
780 // For frequency ties, find mid start coord, and then find mid end
781 // coord of those matching start.
782 // We make that the first item so we merge into that mid point.
783 for (j = 0; j < n; j++) {
784 for (j2 = j+1; j2 < n; j2++) {
785 if (tpos[j].freq != tpos[j2].freq)
786 break;
787 if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
788 break;
789 }
790
791 // j to j2 all within bin of a common start,
792 // m is the mid start.
793 if (j2-1 > j) {
794 size_t m = (j2-1 + j)/2;
795
796 // Find mid end for this same start
797 while (m > 1 && tpos[m].start == tpos[m-1].start)
798 m--;
799 for (j3 = m+1; j3 < j2; j3++) {
800 if (tpos[m].start != tpos[j3].start)
801 break;
802 if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
803 break;
804 }
805 if (j3-1 > m)
806 m = (j3-1 + m)/2;
807
808 // Swap with first item.
809 tcoord_t tmp = tpos[j];
810 tpos[j] = tpos[m];
811 tpos[m] = tmp;
812 j = j2-1;
813 }
814 }
815
816 // Now merge in coordinates.
817 // This bit is O(N^2), so consider binning first to reduce the
818 // size of the list if we have excessive positional variation.
819 for (k = j = 0; j < n; j++) {
820 if (!tpos[j].freq)
821 continue;
822
823 if (k < j)
824 tpos[k] = tpos[j];
825
826 for (j2 = j+1; j2 < n; j2++) {
827 if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
828 ABS(tpos[j].end -tpos[j2].end) < args->tcoord_bin/2 &&
829 tpos[j].status == tpos[j2].status) {
830 tpos[k].freq += tpos[j2].freq;
831 tpos[j2].freq = 0;
832 }
833 }
834 k++;
835 }
836
837 *np = k;
838 }
839
dump_stats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref,int local)840 int dump_stats(astats_args_t *args, char type, char *name, int nfile,
841 amplicons_t *amps, int nref, int local) {
842 int i, r;
843 FILE *ofp = args->out_fp;
844 tcoord_t *tpos = NULL;
845 size_t ntcoord = 0;
846
847 // summary stats for this sample (or for all samples)
848 fprintf(ofp, "# Summary stats.\n");
849 fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
850
851 for (r = 0; r < nref; r++) {
852 if (!amps[r].sites)
853 continue;
854 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
855 int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
856 char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
857 if (!name_ref)
858 return -1;
859 if (args->multi_ref)
860 sprintf(name_ref, "%s\t%s", name, amps[r].ref);
861 else
862 sprintf(name_ref, "%s", name);
863 fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
864 type, name_ref, stats->nseq);
865 fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
866 type, name_ref, stats->nfiltered);
867 fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
868 type, name_ref, stats->nfailprimer);
869 fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
870 type, name_ref, nmatch);
871
872 int d = 0;
873 do {
874 // From first to last amplicon only, so not entire consensus.
875 // If contig length is known, maybe we want to add the missing
876 // count to < DEPTH figures?
877 int64_t start = 0, covered = 0, total = 0;
878 amplicon_t *amp = amps[r].amp;
879 for (i = 0; i < amps[r].namp; i++) {
880 int64_t j, offset = amp[i].min_left-1;
881 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
882 fprintf(stderr, "[ampliconstats] error: "
883 "Maximum amplicon length (%d) exceeded for '%s'\n",
884 stats->max_amp, name);
885 return -1;
886 }
887 for (j = MAX(start, amp[i].max_left-1);
888 j < MAX(start, amp[i].min_right); j++) {
889 if (stats->coverage[i*stats->max_amp_len + j-offset]
890 >= args->min_depth[d])
891 covered++;
892 total++;
893 }
894 start = MAX(start, amp[i].min_right);
895 }
896 fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
897 PRId64"\t%"PRId64"\n", type, name_ref,
898 args->min_depth[d], args->min_depth[d],
899 total-covered, covered);
900 } while (++d < MAX_DEPTH && args->min_depth[d]);
901
902 free(name_ref);
903 }
904
905 // Read count
906 fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
907 fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
908 fprintf(ofp, "%cREADS\t%s", type, name);
909 for (r = 0; r < nref; r++) {
910 if (!amps[r].sites)
911 continue;
912 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
913 for (i = 0; i < amps[r].namp; i++) {
914 fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
915 }
916 }
917 fprintf(ofp, "\n");
918
919 // Valid depth is the number of full length reads (already divided
920 // by the number we expect to cover), so +0.5 per read in pair.
921 // A.k.a "usable depth" in the plots.
922 fprintf(ofp, "%cVDEPTH\t%s", type, name);
923 for (r = 0; r < nref; r++) {
924 if (!amps[r].sites)
925 continue;
926 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
927 for (i = 0; i < amps[r].namp; i++)
928 fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
929 }
930 fprintf(ofp, "\n");
931
932 if (type == 'C') {
933 // For combined we can compute mean & standard deviation too
934 fprintf(ofp, "CREADS\tMEAN");
935 for (r = 0; r < nref; r++) {
936 if (!amps[r].sites)
937 continue;
938 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
939 for (i = 0; i < amps[r].namp; i++) {
940 fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
941 }
942 }
943 fprintf(ofp, "\n");
944
945 fprintf(ofp, "CREADS\tSTDDEV");
946 for (r = 0; r < nref; r++) {
947 if (!amps[r].sites)
948 continue;
949 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
950 for (i = 0; i < amps[r].namp; i++) {
951 double n1 = stats->nreads[i];
952 fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
953 ? sqrt(stats->nreads2[i]/(double)nfile
954 - (n1/nfile)*(n1/nfile))
955 : 0);
956 }
957 }
958 fprintf(ofp, "\n");
959 }
960
961 fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
962 fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
963 fprintf(ofp, "%cRPERC\t%s", type, name);
964 int all_nseq = 0;
965 for (r = 0; r < nref; r++) {
966 if (!amps[r].sites)
967 continue;
968 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
969 all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
970 }
971 for (r = 0; r < nref; r++) {
972 if (!amps[r].sites)
973 continue;
974 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
975 for (i = 0; i < amps[r].namp; i++) {
976 if (type == 'C') {
977 fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
978 } else {
979 fprintf(ofp, "\t%.3f",
980 all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
981 }
982 }
983 }
984 fprintf(ofp, "\n");
985
986 if (type == 'C') {
987 // For combined we compute mean and standard deviation too
988 fprintf(ofp, "CRPERC\tMEAN");
989 for (r = 0; r < nref; r++) {
990 if (!amps[r].sites)
991 continue;
992 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
993 for (i = 0; i < amps[r].namp; i++) {
994 fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
995 }
996 }
997 fprintf(ofp, "\n");
998
999 fprintf(ofp, "CRPERC\tSTDDEV");
1000 for (r = 0; r < nref; r++) {
1001 if (!amps[r].sites)
1002 continue;
1003 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1004 for (i = 0; i < amps[r].namp; i++) {
1005 // variance = SUM(X^2) - ((SUM(X)^2) / N)
1006 double n1 = stats->nrperc[i];
1007 double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
1008 fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
1009 }
1010 }
1011 fprintf(ofp, "\n");
1012 }
1013
1014 // Base depth
1015 fprintf(ofp, "# Read depth per amplicon.\n");
1016 fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
1017 fprintf(ofp, "%cDEPTH\t%s", type, name);
1018 for (r = 0; r < nref; r++) {
1019 if (!amps[r].sites)
1020 continue;
1021 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1022 amplicon_t *amp = amps[r].amp;
1023 for (i = 0; i < amps[r].namp; i++) {
1024 int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1025 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1026 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
1027 }
1028 }
1029 fprintf(ofp, "\n");
1030
1031 if (type == 'C') {
1032 // For combined we can compute mean & standard deviation too
1033 fprintf(ofp, "CDEPTH\tMEAN");
1034 for (r = 0; r < nref; r++) {
1035 if (!amps[r].sites)
1036 continue;
1037 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1038 amplicon_t *amp = amps[r].amp;
1039 int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
1040 for (i = 0; i < amps[r].namp; i++) {
1041 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1042 fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
1043 }
1044 }
1045 fprintf(ofp, "\n");
1046
1047 fprintf(ofp, "CDEPTH\tSTDDEV");
1048 for (r = 0; r < nref; r++) {
1049 if (!amps[r].sites)
1050 continue;
1051 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1052 amplicon_t *amp = amps[r].amp;
1053 for (i = 0; i < amps[r].namp; i++) {
1054 double alen = amp[i].min_right - amp[i].max_left+1;
1055 double n1 = stats->nbases[i] / alen;
1056 double v = stats->nbases2[i] / (alen*alen) /nfile
1057 - (n1/nfile)*(n1/nfile);
1058 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1059 }
1060 }
1061 fprintf(ofp, "\n");
1062 }
1063
1064 // Percent Coverage
1065 if (type == 'F') {
1066 fprintf(ofp, "# Percentage coverage per amplicon\n");
1067 fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
1068 int d = 0;
1069 do {
1070 fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
1071
1072 for (r = 0; r < nref; r++) {
1073 if (!amps[r].sites)
1074 continue;
1075 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1076 amplicon_t *amp = amps[r].amp;
1077 for (i = 0; i < amps[r].namp; i++) {
1078 int covered = 0;
1079 if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
1080 fprintf(stderr, "[ampliconstats] error: "
1081 "Maximum amplicon length (%d) exceeded for '%s'\n",
1082 stats->max_amp, name);
1083 return -1;
1084 }
1085 int64_t j, offset = amp[i].min_left-1;
1086 for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
1087 int apos = i*stats->max_amp_len + j-offset;
1088 if (stats->coverage[apos] >= args->min_depth[d])
1089 covered++;
1090 }
1091 int64_t alen = amp[i].min_right - amp[i].max_left+1;
1092 stats->covered_perc[i][d] = 100.0 * covered / alen;
1093 fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
1094 }
1095 }
1096 fprintf(ofp, "\n");
1097 } while (++d < MAX_DEPTH && args->min_depth[d]);
1098
1099 } else if (type == 'C') {
1100 // For combined we can compute mean & standard deviation too
1101 int d = 0;
1102 do {
1103 fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
1104 for (r = 0; r < nref; r++) {
1105 if (!amps[r].sites)
1106 continue;
1107 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1108 for (i = 0; i < amps[r].namp; i++) {
1109 fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
1110 }
1111 }
1112 fprintf(ofp, "\n");
1113
1114 fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
1115 for (r = 0; r < nref; r++) {
1116 if (!amps[r].sites)
1117 continue;
1118 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1119 for (i = 0; i < amps[r].namp; i++) {
1120 double n1 = stats->covered_perc[i][d] / nfile;
1121 double v = stats->covered_perc2[i][d] / nfile - n1*n1;
1122 fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
1123 }
1124 }
1125 fprintf(ofp, "\n");
1126 } while (++d < MAX_DEPTH && args->min_depth[d]);
1127 }
1128
1129 // Plus base depth for all reads, irrespective of amplicon.
1130 // This is post overlap removal, if reads in the read-pair overlap.
1131 fprintf(ofp, "# Depth per reference base for ALL data.\n");
1132 fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
1133 type);
1134 for (r = 0; r < nref; r++) {
1135 if (!amps[r].sites)
1136 continue;
1137 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1138 if (args->multi_ref)
1139 fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
1140 else
1141 fprintf(ofp, "%cDP_ALL\t%s", type, name);
1142
1143 for (i = 0; i < amps[r].len; i++) {
1144 // Basic run-length encoding provided all values are within
1145 // +- depth_bin fraction of the mid-point.
1146 int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
1147 double dmid = (dmin + dmax)/2.0;
1148 double low = dmid*(1-args->depth_bin);
1149 double high = dmid*(1+args->depth_bin);
1150 for (j = i+1; j < amps[r].len; j++) {
1151 int d = stats->depth_all[j];
1152 if (d < low || d > high)
1153 break;
1154 if (dmin > d) {
1155 dmin = d;
1156 dmid = (dmin + dmax)/2.0;
1157 low = dmid*(1-args->depth_bin);
1158 high = dmid*(1+args->depth_bin);
1159 } else if (dmax < d) {
1160 dmax = d;
1161 dmid = (dmin + dmax)/2.0;
1162 low = dmid*(1-args->depth_bin);
1163 high = dmid*(1+args->depth_bin);
1164 }
1165 }
1166 fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1167 i = j-1;
1168 }
1169 fprintf(ofp, "\n");
1170 }
1171
1172 // And depth for only reads matching to a single amplicon for full
1173 // length. This is post read overlap removal.
1174 fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
1175 fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
1176 "part.\n", type);
1177 for (r = 0; r < nref; r++) {
1178 if (!amps[r].sites)
1179 continue;
1180 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1181 if (args->multi_ref)
1182 fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
1183 else
1184 fprintf(ofp, "%cDP_VALID\t%s", type, name);
1185
1186 for (i = 0; i < amps[r].len; i++) {
1187 int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
1188 double dmid = (dmin + dmax)/2.0;
1189 double low = dmid*(1-args->depth_bin);
1190 double high = dmid*(1+args->depth_bin);
1191 for (j = i+1; j < amps[r].len; j++) {
1192 int d = stats->depth_valid[j];
1193 if (d < low || d > high)
1194 break;
1195 if (dmin > d) {
1196 dmin = d;
1197 dmid = (dmin + dmax)/2.0;
1198 low = dmid*(1-args->depth_bin);
1199 high = dmid*(1+args->depth_bin);
1200 } else if (dmax < d) {
1201 dmax = d;
1202 dmid = (dmin + dmax)/2.0;
1203 low = dmid*(1-args->depth_bin);
1204 high = dmid*(1+args->depth_bin);
1205 }
1206 }
1207 fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
1208 i = j-1;
1209 }
1210 fprintf(ofp, "\n");
1211 }
1212
1213 // TCOORD (start to end) distribution
1214 fprintf(ofp, "# Distribution of aligned template coordinates.\n");
1215 fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
1216 for (r = 0; r < nref; r++) {
1217 if (!amps[r].sites)
1218 continue;
1219 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1220 for (i = 0 - (nref==1); i < amps[r].namp; i++) {
1221 if (ntcoord < kh_size(stats->tcoord[i+1])) {
1222 ntcoord = kh_size(stats->tcoord[i+1]);
1223 tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
1224 if (!tmp) {
1225 free(tpos);
1226 return -1;
1227 }
1228 tpos = tmp;
1229 }
1230
1231 khiter_t k;
1232 size_t n = 0, j;
1233 for (k = kh_begin(stats->tcoord[i+1]);
1234 k != kh_end(stats->tcoord[i+1]); k++) {
1235 if (!kh_exist(stats->tcoord[i+1], k) ||
1236 (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
1237 continue;
1238 // Key is start,end in 32-bit quantities.
1239 // Yes this limits us to 4Gb references, but just how
1240 // many primers are we planning on making? Not that many
1241 // I hope.
1242 tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
1243 tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32;
1244
1245 // Value is frequency (top 32-bits) and status (bottom 32).
1246 tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff;
1247 tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
1248 n++;
1249 }
1250
1251 if (args->tcoord_bin > 1)
1252 aggregate_tcoord(args, tpos, &n);
1253
1254 fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
1255 i+1+amps[r].first_amp); // per amplicon
1256 for (j = 0; j < n; j++) {
1257 if (tpos[j].freq < args->tcoord_min_count)
1258 continue;
1259 fprintf(ofp, "\t%d,%d,%u,%u",
1260 tpos[j].start,
1261 tpos[j].end,
1262 tpos[j].freq,
1263 tpos[j].status);
1264 }
1265 fprintf(ofp, "\n");
1266 }
1267 }
1268
1269
1270 // AMP length distribution.
1271 // 0 = both ends in this amplicon
1272 // 1 = ends in different amplicons
1273 // 2 = other end matching an unknown amplicon site
1274 // (see tcoord for further analysis of where)
1275 fprintf(ofp, "# Classification of amplicon status. Columns are\n");
1276 fprintf(ofp, "# number with both primers from this amplicon, number with\n");
1277 fprintf(ofp, "# primers from different amplicon, and number with a position\n");
1278 fprintf(ofp, "# not matching any valid amplicon primer site\n");
1279 fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
1280
1281 fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
1282 int amp_dist[3] = {0};
1283 for (r = 0; r < nref; r++) {
1284 if (!amps[r].sites)
1285 continue;
1286 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1287 for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
1288 amp_dist[0] += stats->amp_dist[i][0];
1289 amp_dist[1] += stats->amp_dist[i][1];
1290 amp_dist[2] += stats->amp_dist[i][2];
1291 }
1292 }
1293 fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
1294
1295 for (r = 0; r < nref; r++) {
1296 if (!amps[r].sites)
1297 continue;
1298 astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
1299 for (i = 0; i < amps[r].namp; i++) {
1300 // per amplicon
1301 fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
1302 fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
1303 stats->amp_dist[i][1], stats->amp_dist[i][2]);
1304 }
1305 }
1306
1307 free(tpos);
1308 return 0;
1309 }
1310
dump_lstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1311 int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
1312 amplicons_t *amps, int nref) {
1313 return dump_stats(args, type, name, nfile, amps, nref, 1);
1314 }
1315
dump_gstats(astats_args_t * args,char type,char * name,int nfile,amplicons_t * amps,int nref)1316 int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
1317 amplicons_t *amps, int nref) {
1318 return dump_stats(args, type, name, nfile, amps, nref, 0);
1319 }
1320
get_sample_name(sam_hdr_t * header,char * RG)1321 char const *get_sample_name(sam_hdr_t *header, char *RG) {
1322 kstring_t ks = {0};
1323 sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks);
1324 return ks.s;
1325 }
1326
1327 // Return maximum reference length (SQ is NULL) or the length
1328 // of the specified reference in SQ.
get_ref_len(sam_hdr_t * header,const char * SQ)1329 int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
1330 if (SQ) {
1331 int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0;
1332 return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1;
1333 } else {
1334 int nref = sam_hdr_nref(header), tid;;
1335 int64_t len = 0;
1336 for (tid = 0; tid < nref; tid++) {
1337 int64_t rl = sam_hdr_tid2len(header, tid);
1338 if (len < rl)
1339 len = rl;
1340 }
1341 return len;
1342 }
1343 }
1344
amplicon_stats(astats_args_t * args,khash_t (bed_list_hash)* bed_hash,char ** filev,int filec)1345 static int amplicon_stats(astats_args_t *args,
1346 khash_t(bed_list_hash) *bed_hash,
1347 char **filev, int filec) {
1348 int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
1349 samFile *fp = NULL;
1350 sam_hdr_t *header = NULL;
1351 bam1_t *b = bam_init1();
1352 FILE *ofp = args->out_fp;
1353 char sname_[8192], *sname = NULL;
1354 amplicons_t *amps = NULL;
1355
1356 // Report initial SS header. We gather data from the bed_hash entries
1357 // as well as from the first SAM header (with the requirement that all
1358 // headers should be compatible).
1359 if (filec) {
1360 if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
1361 print_error_errno("ampliconstats",
1362 "Cannot open input file \"%s\"",
1363 filev[0]);
1364 goto err;
1365 }
1366 if (!(header = sam_hdr_read(fp)))
1367 goto err;
1368
1369 if (!amps) {
1370 amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
1371 if (!amps)
1372 goto err;
1373 fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
1374 fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
1375 fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
1376 fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
1377
1378 // Note: order of hash entries will be different to order of
1379 // BED file which may also differ to order of SQ headers.
1380 // SQ header is canonical ordering (pos sorted file).
1381 khiter_t k;
1382 int bam_nref = sam_hdr_nref(header);
1383 for (i = 0; i < bam_nref; i++) {
1384 k = kh_get(bed_list_hash, bed_hash,
1385 sam_hdr_tid2name(header, i));
1386 if (!kh_exist(bed_hash, k))
1387 continue;
1388
1389 bed_entry_list_t *sites = &kh_value(bed_hash, k);
1390
1391 ref = i;
1392 amps[ref].ref = kh_key(bed_hash, k);
1393 amps[ref].sites = sites;
1394 amps[ref].namp = count_amplicon(sites);
1395 amps[ref].amp = calloc(sites->length,
1396 sizeof(*amps[ref].amp));
1397 if (!amps[ref].amp)
1398 goto err;
1399 if (args->multi_ref)
1400 fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
1401 kh_key(bed_hash, k), amps[ref].namp);
1402 else
1403 fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
1404 amps[ref].namp);
1405
1406 amps[ref].tid = ref;
1407 if (ref_tid == -1)
1408 ref_tid = ref;
1409
1410 int64_t len = get_ref_len(header, kh_key(bed_hash, k));
1411 amps[ref].len = len;
1412 if (args->multi_ref)
1413 fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
1414 kh_key(bed_hash, k), len);
1415 else
1416 fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
1417 len);
1418
1419 amps[ref].lstats = stats_alloc(len, args->max_amp,
1420 args->max_amp_len);
1421 amps[ref].gstats = stats_alloc(len, args->max_amp,
1422 args->max_amp_len);
1423 if (!amps[ref].lstats || !amps[ref].gstats)
1424 goto err;
1425 }
1426 }
1427
1428 sam_hdr_destroy(header);
1429 header = NULL;
1430 if (sam_close(fp) < 0) {
1431 fp = NULL;
1432 goto err;
1433 }
1434 fp = NULL;
1435 }
1436 fprintf(ofp, "SS\tEnd of summary\n");
1437
1438 // Extract the bits of amplicon data we need from bed hash and turn
1439 // it into a position-to-amplicon lookup table.
1440 int offset = 0;
1441 for (i = 0; i < nref; i++) {
1442 if (!amps[i].sites)
1443 continue;
1444
1445 amps[i].first_amp = offset;
1446 if (bed2amplicon(args, amps[i].sites, amps[i].amp,
1447 &s[i].namp, i==0, amps[i].ref, offset) < 0)
1448 goto err;
1449
1450 offset += amps[i].namp; // cumulative amplicon number across refs
1451 }
1452
1453 // Now iterate over file contents, one at a time.
1454 for (i = 0; i < filec; i++) {
1455 char *nstart = filev[i];
1456
1457 fp = sam_open_format(filev[i], "r", &args->ga.in);
1458 if (!fp) {
1459 print_error_errno("ampliconstats",
1460 "Cannot open input file \"%s\"",
1461 filev[i]);
1462 goto err;
1463 }
1464
1465 if (args->ga.nthreads > 0)
1466 hts_set_threads(fp, args->ga.nthreads);
1467
1468 if (!(header = sam_hdr_read(fp)))
1469 goto err;
1470
1471 if (nref != sam_hdr_nref(header)) {
1472 print_error_errno("ampliconstats",
1473 "SAM headers are not consistent across input files");
1474 goto err;
1475 }
1476 int r;
1477 for (r = 0; r < nref; r++) {
1478 if (!amps[r].ref ||
1479 strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
1480 amps[r].len != sam_hdr_tid2len(header, r)) {
1481 print_error_errno("ampliconstats",
1482 "SAM headers are not consistent across "
1483 "input files");
1484 goto err;
1485 }
1486 }
1487
1488 if (args->use_sample_name)
1489 sname = (char *)get_sample_name(header, NULL);
1490
1491 if (!sname) {
1492 sname = sname_;
1493 char *nend = filev[i] + strlen(filev[i]), *cp;
1494 if ((cp = strrchr(filev[i], '/')))
1495 nstart = cp+1;
1496 if ((cp = strrchr(nstart, '.')) &&
1497 (strcmp(cp, ".bam") == 0 ||
1498 strcmp(cp, ".sam") == 0 ||
1499 strcmp(cp, ".cram") == 0))
1500 nend = cp;
1501 if (nend - nstart >= 8192) nend = nstart+8191;
1502 memcpy(sname, nstart, nend-nstart);
1503 sname[nend-nstart] = 0;
1504 }
1505
1506 // Stats local to this sample only
1507 amp_stats_reset(amps, nref);
1508
1509 int last_ref = -9;
1510 while ((r = sam_read1(fp, header, b)) >= 0) {
1511 // Other filter options useful here?
1512 if (b->core.tid < 0)
1513 continue;
1514
1515 if (last_ref != b->core.tid) {
1516 last_ref = b->core.tid;
1517 if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
1518 goto err;
1519 }
1520
1521 if (accumulate_stats(args, amps, b) < 0)
1522 goto err;
1523 }
1524
1525 if (r < -1) {
1526 print_error_errno("ampliconstats", "Fail reading record");
1527 goto err;
1528 }
1529
1530 sam_hdr_destroy(header);
1531 if (sam_close(fp) < 0) {
1532 fp = NULL;
1533 goto err;
1534 }
1535
1536 fp = NULL;
1537 header = NULL;
1538
1539 if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
1540 goto err;
1541
1542 if (append_stats(amps, nref) < 0)
1543 goto err;
1544
1545 if (sname && sname != sname_)
1546 free(sname);
1547 sname = NULL;
1548 }
1549
1550 if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
1551 goto err;
1552
1553 ret = 0;
1554 err:
1555 bam_destroy1(b);
1556 if (ret) {
1557 if (header)
1558 sam_hdr_destroy(header);
1559 if (fp)
1560 sam_close(fp);
1561 }
1562 for (i = 0; i < nref; i++) {
1563 stats_free(amps[i].lstats);
1564 stats_free(amps[i].gstats);
1565 free(amps[i].amp);
1566 }
1567 free(amps);
1568 free(pos2start);
1569 free(pos2end);
1570 if (ret) {
1571 if (sname && sname != sname_)
1572 free(sname);
1573 }
1574
1575 return ret;
1576 }
1577
usage(astats_args_t * args,FILE * fp,int exit_status)1578 static int usage(astats_args_t *args, FILE *fp, int exit_status) {
1579 fprintf(fp,
1580 "\n"
1581 "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
1582 "\n"
1583 "Options:\n");
1584 fprintf(fp, " -f, --required-flag STR|INT\n"
1585 " Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require);
1586 fprintf(fp, " -F, --filter-flag STR|INT\n"
1587 " Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff);
1588 fprintf(fp, " -a, --max-amplicons INT\n"
1589 " Change the maximum number of amplicons permitted [%d]\n", MAX_AMP);
1590 fprintf(fp, " -l, --max-amplicon-length INT\n"
1591 " Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN);
1592 fprintf(fp, " -d, --min-depth INT[,INT]...\n"
1593 " Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]);
1594 fprintf(fp, " -m, --pos-margin INT\n"
1595 " Margin of error for matching primer positions [%d]\n", args->max_delta);
1596 fprintf(fp, " -o, --output FILE\n"
1597 " Specify output file [stdout if unset]\n");
1598 fprintf(fp, " -s, --use-sample-name\n"
1599 " Use the sample name from the first @RG header line\n");
1600 fprintf(fp, " -t, --tlen-adjust INT\n"
1601 " Add/subtract from TLEN; use when clipping but no fixmate step\n");
1602 fprintf(fp, " -b, --tcoord-bin INT\n"
1603 " Bin template start,end positions into multiples of INT[1]\n");
1604 fprintf(fp, " -c, --tcoord-min-count INT\n"
1605 " Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
1606 fprintf(fp, " -D, --depth-bin FRACTION\n"
1607 " Merge FDP values within +/- FRACTION together\n");
1608 fprintf(fp, " -S, --single-ref\n"
1609 " Force single-ref (<=1.12) output format\n");
1610 sam_global_opt_help(fp, "I.--.@");
1611
1612 return exit_status;
1613 }
1614
main_ampliconstats(int argc,char ** argv)1615 int main_ampliconstats(int argc, char **argv) {
1616 astats_args_t args = {
1617 .ga = SAM_GLOBAL_ARGS_INIT,
1618 .flag_require = 0,
1619 .flag_filter = 0x10B04,
1620 //.sites = BED_LIST_INIT,
1621 .max_delta = 30, // large enough to cope with alt primers
1622 .min_depth = {1},
1623 .use_sample_name = 0,
1624 .max_amp = MAX_AMP,
1625 .max_amp_len = MAX_AMP_LEN,
1626 .tlen_adj = 0,
1627 .out_fp = stdout,
1628 .tcoord_min_count = TCOORD_MIN_COUNT,
1629 .tcoord_bin = 1,
1630 .depth_bin = 0.01,
1631 .multi_ref = 1
1632 }, oargs = args;
1633
1634 static const struct option loptions[] =
1635 {
1636 SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
1637 {"help", no_argument, NULL, 'h'},
1638 {"flag-require", required_argument, NULL, 'f'},
1639 {"flag-filter", required_argument, NULL, 'F'},
1640 {"min-depth", required_argument, NULL, 'd'},
1641 {"output", required_argument, NULL, 'o'},
1642 {"pos-margin", required_argument, NULL, 'm'},
1643 {"use-sample-name", no_argument, NULL, 's'},
1644 {"max-amplicons", required_argument, NULL, 'a'},
1645 {"max-amplicon-length", required_argument, NULL, 'l'},
1646 {"tlen-adjust", required_argument, NULL, 't'},
1647 {"tcoord-min-count", required_argument, NULL, 'c'},
1648 {"tcoord-bin", required_argument, NULL, 'b'},
1649 {"depth-bin", required_argument, NULL, 'D'},
1650 {"single-ref", no_argument, NULL, 'S'},
1651 {NULL, 0, NULL, 0}
1652 };
1653 int opt;
1654
1655 while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
1656 switch (opt) {
1657 case 'f': args.flag_require = bam_str2flag(optarg); break;
1658 case 'F':
1659 if (args.flag_filter & 0x10000)
1660 args.flag_filter = 0; // strip default on first -F usage
1661 args.flag_filter |= bam_str2flag(optarg); break;
1662
1663 case 'm': args.max_delta = atoi(optarg); break; // margin
1664 case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
1665 case 'd': {
1666 int d = 0;
1667 char *cp = optarg, *ep;
1668 do {
1669 long n = strtol(cp, &ep, 10);
1670 args.min_depth[d++] = n;
1671 if (*ep != ',')
1672 break;
1673 cp = ep+1;
1674 } while (d < MAX_DEPTH);
1675 break;
1676 }
1677
1678 case 'a': args.max_amp = atoi(optarg)+1;break;
1679 case 'l': args.max_amp_len = atoi(optarg)+1;break;
1680
1681 case 'c': args.tcoord_min_count = atoi(optarg);break;
1682 case 'b':
1683 args.tcoord_bin = atoi(optarg);
1684 if (args.tcoord_bin < 1)
1685 args.tcoord_bin = 1;
1686 break;
1687
1688 case 't': args.tlen_adj = atoi(optarg);break;
1689
1690 case 's': args.use_sample_name = 1;break;
1691
1692 case 'o':
1693 if (!(args.out_fp = fopen(optarg, "w"))) {
1694 perror(optarg);
1695 return 1;
1696 }
1697 break;
1698
1699 case 'S':
1700 args.multi_ref = 0;
1701 break;
1702
1703 case '?': return usage(&oargs, stderr, EXIT_FAILURE);
1704 case 'h': return usage(&oargs, stdout, EXIT_SUCCESS);
1705
1706 default:
1707 if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
1708 usage(&oargs,stderr, EXIT_FAILURE);
1709 break;
1710 }
1711 }
1712
1713 if (argc <= optind)
1714 return usage(&oargs, stdout, EXIT_SUCCESS);
1715 if (argc <= optind+1 && isatty(STDIN_FILENO))
1716 return usage(&oargs, stderr, EXIT_FAILURE);
1717
1718 khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
1719 if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
1720 print_error_errno("ampliconstats",
1721 "Could not read file \"%s\"", argv[optind]);
1722 return 1;
1723
1724 }
1725
1726 khiter_t k, ref_count = 0;
1727 for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
1728 if (!kh_exist(bed_hash, k))
1729 continue;
1730 ref_count++;
1731 }
1732 if (ref_count == 0)
1733 return 1;
1734 if (ref_count > 1 && args.multi_ref == 0) {
1735 print_error("ampliconstats",
1736 "Single-ref mode is not permitted for BED files\n"
1737 "containing more than one reference.");
1738 return 1;
1739 }
1740
1741 args.argv = stringify_argv(argc, argv);
1742 int ret;
1743 if (argc == ++optind) {
1744 char *av = "-";
1745 ret = amplicon_stats(&args, bed_hash, &av, 1);
1746 } else {
1747 ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
1748 }
1749
1750 free(args.argv);
1751 destroy_bed_hash(bed_hash);
1752
1753 return ret;
1754 }
1755