1 #include "samtools.pysam.h"
2
3 /* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
4 from the 5' end.
5
6 Copyright (C) 2020-2021 Genome Research Ltd.
7
8 Authors: Andrew Whitwham <aw7@sanger.ac.uk>
9 Rob Davies <rmd+git@sanger.ac.uk>
10
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to deal
13 in the Software without restriction, including without limitation the rights
14 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
17
18 The above copyright notice and this permission notice shall be included in
19 all copies or substantial portions of the Software.
20
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE
28 */
29
30 #include <config.h>
31
32 #include <assert.h>
33 #include <stdbool.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include "htslib/thread_pool.h"
39 #include "sam_opts.h"
40 #include <htslib/hts.h>
41 #include "htslib/hfile.h"
42 #include "htslib/kstring.h"
43 #include "htslib/sam.h"
44 #include "samtools.h"
45 #include "bam_ampliconclip.h"
46
47 typedef enum {
48 soft_clip,
49 hard_clip
50 } clipping_type;
51
52 typedef struct {
53 int add_pg;
54 int use_strand;
55 int write_clipped;
56 int mark_fail;
57 int both;
58 int fail_len;
59 int filter_len;
60 int unmapped;
61 int oa_tag;
62 int del_tag;
63 int tol;
64 char *arg_list;
65 char *stats_file;
66 char *rejects_file;
67 } cl_param_t;
68
69
bed_entry_sort(const void * av,const void * bv)70 static int bed_entry_sort(const void *av, const void *bv) {
71 bed_entry_t *a = (bed_entry_t *) av;
72 bed_entry_t *b = (bed_entry_t *) bv;
73 return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1);
74 }
75
76
load_bed_file_multi_ref(char * infile,int get_strand,int sort_by_pos,khash_t (bed_list_hash)* bed_lists)77 int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
78 hFILE *fp;
79 int line_count = 0, ret;
80 int64_t left, right;
81 kstring_t line = KS_INITIALIZE;
82 bed_entry_list_t *list;
83 khiter_t bed_itr;
84
85 if ((fp = hopen(infile, "r")) == NULL) {
86 print_error_errno("amplicon", "unable to open file %s.", infile);
87 return 1;
88 }
89
90 char ref[1024];
91
92 while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
93 line_count++;
94 int hret;
95 char strand;
96
97 if (line.l == 0 || *line.s == '#') continue;
98 if (strncmp(line.s, "track ", 6) == 0) continue;
99 if (strncmp(line.s, "browser ", 8) == 0) continue;
100
101 if (get_strand) {
102 if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
103 ref, &left, &right, &strand) != 4) {
104 fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
105 "(N.B. ref/chrom name limited to 1023 characters.)\n",
106 line_count, infile);
107 ret = 1;
108 goto error;
109 }
110 } else {
111 if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
112 ref, &left, &right) != 3) {
113 fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
114 "(N.B. ref/chrom name limited to 1023 characters.)\n",
115 line_count, infile);
116 ret = 1;
117 goto error;
118 }
119 }
120
121 bed_itr = kh_get(bed_list_hash, bed_lists, ref);
122
123 if (bed_itr == kh_end(bed_lists)) { // new ref entry
124 char *ref_name = strdup(ref); // need a copy for the hash key
125
126 if (!ref_name) {
127 fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
128 ret = 1;
129 goto error;
130 }
131
132 bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
133
134 if (hret > 0) {
135 list = &kh_val(bed_lists, bed_itr);
136
137 // initialise the new hash entry
138 list->longest = 0;
139 list->size = 0;
140 list->length = 0;
141 list->bp = NULL;
142 } else {
143 fprintf(samtools_stderr, "[amplicon] error: ref hashing failure.\n");
144 ret = 1;
145 goto error;
146 }
147 } else { // existing ref
148 list = &kh_val(bed_lists, bed_itr);
149 }
150
151 if (list->length == list->size) {
152 bed_entry_t *tmp;
153
154 list->size += list->size / 2 + 256;
155
156 if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) {
157 fprintf(samtools_stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
158 ret = 1;
159 goto error;
160 }
161
162 list->bp = tmp;
163 }
164
165 list->bp[list->length].left = left;
166 list->bp[list->length].right = right;
167
168 if (get_strand) {
169 if (strand == '+') {
170 list->bp[list->length].rev = 0;
171 } else if (strand == '-') {
172 list->bp[list->length].rev = 1;
173 } else {
174 fprintf(samtools_stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
175 line_count, strand);
176 ret = 1;
177 goto error;
178 }
179 }
180
181 if (right - left > list->longest)
182 list->longest = right - left;
183
184 list->length++;
185 }
186
187 if (sort_by_pos) {
188 for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
189 if (kh_exist(bed_lists, bed_itr)) {
190 list = &kh_val(bed_lists, bed_itr);
191 qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
192 }
193 }
194 }
195
196 if (kh_size(bed_lists) > 0) {// any entries
197 ret = 0;
198 } else {
199 ret = 1;
200 }
201
202 error:
203 ks_free(&line);
204
205 if (hclose(fp) != 0) {
206 fprintf(samtools_stderr, "[amplicon] warning: failed to close %s", infile);
207 }
208
209 return ret;
210 }
211
212
destroy_bed_hash(khash_t (bed_list_hash)* hash)213 void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
214 khiter_t itr;
215
216 for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) {
217 if (kh_exist(hash, itr)) {
218 free(kh_val(hash, itr).bp);
219 free((char *)kh_key(hash, itr));
220 kh_key(hash, itr) = NULL;
221 }
222 }
223
224 kh_destroy(bed_list_hash, hash);
225 }
226
227
matching_clip_site(bed_entry_list_t * sites,hts_pos_t pos,int is_rev,int use_strand,int64_t longest,cl_param_t * param)228 static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
229 int is_rev, int use_strand, int64_t longest,
230 cl_param_t *param) {
231 int i, size; // may need this to be variable
232 int tol = param->tol;
233 int l = 0, mid = sites->length / 2, r = sites->length;
234 int pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
235
236 while (r - l > 1) {
237 if (sites->bp[mid].right <= pos_tol) {
238 l = mid;
239 } else {
240 r = mid;
241 }
242 mid = (l + r) / 2;
243 }
244
245 size = 0;
246
247 for (i = l; i < sites->length; i++) {
248 hts_pos_t mod_left, mod_right;
249
250 if (use_strand && is_rev != sites->bp[i].rev)
251 continue;
252
253 if (is_rev) {
254 mod_left = sites->bp[i].left;
255 mod_right = sites->bp[i].right + tol;
256 } else {
257 if (sites->bp[i].left > tol) {
258 mod_left = sites->bp[i].left - tol;
259 } else {
260 mod_left = 0;
261 }
262 mod_right = sites->bp[i].right;
263 }
264
265 if (pos + longest + tol < mod_right)
266 break;
267
268 if (pos >= mod_left && pos <= mod_right) {
269 if (is_rev) {
270 if (size < pos - sites->bp[i].left) {
271 size = pos - sites->bp[i].left;
272 }
273 } else {
274 if (size < sites->bp[i].right - pos) {
275 size = sites->bp[i].right - pos;
276 }
277 }
278 }
279 }
280
281 return size;
282 }
283
284
bam_trim_left(bam1_t * rec,bam1_t * rec_out,uint32_t bases,clipping_type clipping)285 static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
286 clipping_type clipping) {
287 uint32_t *orig_cigar = bam_get_cigar(rec);
288 uint8_t *orig_seq = bam_get_seq(rec);
289 uint8_t *orig_qual = bam_get_qual(rec);
290 uint8_t *orig_aux = bam_get_aux(rec);
291 uint32_t *new_cigar;
292 uint8_t *new_qual;
293 size_t orig_l_aux = bam_get_l_aux(rec);
294 uint32_t i, j, odd_base = 0;
295 uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
296 hts_pos_t new_pos = rec->core.pos;
297 uint32_t cig_type, cig_op;
298
299 if (rec->l_data + 8 > rec_out->m_data) {
300 uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
301 if (!new_data) {
302 fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
303 return 1;
304 }
305 rec_out->data = new_data;
306 rec_out->m_data = rec->l_data + 8;
307 }
308
309 // Copy core data & name
310 memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
311 memcpy(rec_out->data, rec->data, rec->core.l_qname);
312
313 if (clipping == hard_clip && bases >= rec->core.l_qseq) {
314 rec_out->core.l_qseq = 0;
315 rec_out->core.n_cigar = 0;
316
317 if (orig_l_aux)
318 memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
319
320 rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
321
322 return 0;
323 }
324
325 // Modify CIGAR
326 new_cigar = bam_get_cigar(rec_out);
327
328 for (i = 0; i < rec->core.n_cigar; i++) {
329 cig_op = bam_cigar_op(orig_cigar[i]);
330 cig_type = bam_cigar_type(cig_op);
331
332 if (cig_op == BAM_CHARD_CLIP) {
333 hardclip += bam_cigar_oplen(orig_cigar[i]);
334 } else {
335 if (cig_type & 2) {
336 if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
337 ref_remove -= bam_cigar_oplen(orig_cigar[i]);
338 } else {
339 break;
340 }
341 new_pos += bam_cigar_oplen(orig_cigar[i]);
342 }
343 if (cig_type & 1) {
344 qry_removed += bam_cigar_oplen(orig_cigar[i]);
345 }
346 }
347 }
348
349 if (i < rec->core.n_cigar) {
350 cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
351
352 // account for the last operation
353 if (cig_type & 2) {
354 new_pos += ref_remove;
355 }
356 if (cig_type & 1) {
357 qry_removed += ref_remove;
358 }
359 } else {
360 qry_removed = rec->core.l_qseq;
361 }
362
363 j = 0;
364 if (clipping == hard_clip && hardclip + qry_removed > 0) {
365 new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
366 }
367 if (clipping == soft_clip) {
368 if (hardclip > 0) {
369 new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
370 }
371 if (qry_removed > 0) {
372 new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
373 }
374 }
375
376 if (i < rec->core.n_cigar
377 && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
378 new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
379
380 // fill in the rest of the cigar
381 i++;
382
383 for (; i < rec->core.n_cigar; i++) {
384 new_cigar[j++] = orig_cigar[i];
385 }
386 }
387
388 rec_out->core.n_cigar = j;
389
390 if (clipping == soft_clip) {
391 qry_removed = 0; // Copy all the sequence and confidence values
392 odd_base = 1; // account for an odd number of bases
393 }
394
395 new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
396 // Copy remaining SEQ
397 if ((qry_removed & 1) == 0) {
398 memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
399 (rec->core.l_qseq - qry_removed + odd_base) / 2);
400 } else {
401 uint8_t *in = orig_seq + qry_removed / 2;
402 uint8_t *out = bam_get_seq(rec_out);
403 uint32_t i;
404 for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) {
405 *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
406 in++;
407 }
408 if (i < rec->core.l_qseq) {
409 *out++ = (in[0] & 0x0f) << 4;
410 }
411 assert(out == new_qual);
412 }
413
414 // Copy remaining QUAL
415 memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
416
417 // Set new l_qseq
418 rec_out->core.l_qseq -= qry_removed;
419
420 // Move AUX
421 if (orig_l_aux)
422 memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
423
424 // Set new l_data
425 rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
426
427 // put in new pos
428 rec_out->core.pos = new_pos;
429
430 return 0;
431 }
432
433
bam_trim_right(bam1_t * rec,bam1_t * rec_out,uint32_t bases,clipping_type clipping)434 static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
435 clipping_type clipping) {
436 uint32_t *orig_cigar = bam_get_cigar(rec);
437 uint8_t *orig_seq = bam_get_seq(rec);
438 uint8_t *orig_qual = bam_get_qual(rec);
439 uint8_t *orig_aux = bam_get_aux(rec);
440 uint32_t *new_cigar;
441 uint32_t new_n_cigar = 0;
442 uint8_t *new_qual;
443 size_t orig_l_aux = bam_get_l_aux(rec);
444 int32_t i;
445 int32_t j;
446 uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
447 uint32_t cig_type, cig_op;
448
449 if (rec->l_data + 8 > rec_out->m_data) {
450 uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
451 if (!new_data) {
452 fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
453 return 1;
454 }
455 rec_out->data = new_data;
456 rec_out->m_data = rec->l_data + 8;
457 }
458
459 // Copy core data & name
460 memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
461 memcpy(rec_out->data, rec->data, rec->core.l_qname);
462
463 if (clipping == hard_clip && bases >= rec->core.l_qseq) {
464 rec_out->core.l_qseq = 0;
465 rec_out->core.n_cigar = 0;
466
467 if (orig_l_aux)
468 memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
469
470 rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
471 return 0;
472 }
473
474 // Modify CIGAR here
475 new_cigar = bam_get_cigar(rec_out);
476
477 for (i = rec->core.n_cigar - 1; i >= 0; --i) {
478 cig_op = bam_cigar_op(orig_cigar[i]);
479 cig_type = bam_cigar_type(cig_op);
480
481 if (cig_op == BAM_CHARD_CLIP) {
482 hardclip += bam_cigar_oplen(orig_cigar[i]);
483 } else {
484 if (cig_type & 2) {
485 if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
486 ref_remove -= bam_cigar_oplen(orig_cigar[i]);
487 } else {
488 break;
489 }
490 }
491 if (cig_type & 1) {
492 qry_removed += bam_cigar_oplen(orig_cigar[i]);
493 }
494 }
495 }
496
497 if (i >= 0) {
498 cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
499 if (cig_type & 1) {
500 qry_removed += ref_remove;
501 }
502 j = i;
503 if (qry_removed > 0) j++;
504 if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
505 } else {
506 qry_removed = rec->core.l_qseq;
507 j = 0;
508 if (hardclip > 0 && clipping == soft_clip) j++;
509 }
510
511 if (clipping == hard_clip && hardclip + qry_removed > 0) {
512 new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
513 new_n_cigar++;
514 }
515 if (clipping == soft_clip) {
516 if (hardclip > 0) {
517 new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
518 new_n_cigar++;
519 if (qry_removed > 0) --j;
520 }
521 if (qry_removed > 0) {
522 new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
523 new_n_cigar++;
524 }
525 }
526
527 if (j > 0) {
528 new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
529 new_n_cigar++;
530 }
531
532 // fill in the rest of the cigar
533 while (j > 0) {
534 new_cigar[--j] = orig_cigar[--i];
535 new_n_cigar++;
536 }
537
538 rec_out->core.n_cigar = new_n_cigar;
539
540 if (clipping == soft_clip)
541 qry_removed = 0; // Copy all the sequence and confidence values
542
543 new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
544 // Copy remaining SEQ
545 memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
546
547 // Copy remaining QUAL
548 memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
549
550 // Set new l_qseq
551 rec_out->core.l_qseq -= qry_removed;
552
553 // Copy AUX
554 if (orig_l_aux)
555 memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
556
557 // Set new l_data
558 rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
559
560 return 0;
561 }
562
563
active_query_len(bam1_t * b)564 static hts_pos_t active_query_len(bam1_t *b) {
565 uint32_t *cigar = bam_get_cigar(b);
566 uint32_t cig_type, cig_op;
567 hts_pos_t len = 0;
568 int i;
569
570 for (i = 0; i < b->core.n_cigar; i++) {
571 cig_op = bam_cigar_op(cigar[i]);
572 cig_type = bam_cigar_type(cig_op);
573
574 if ((cig_type & 1) && (cig_op != BAM_CSOFT_CLIP)) {
575 len += bam_cigar_oplen(cigar[i]);
576 }
577 }
578
579 return len;
580 }
581
582
swap_bams(bam1_t ** a,bam1_t ** b)583 static inline void swap_bams(bam1_t **a, bam1_t **b) {
584 bam1_t *tmp = *a;
585 *a = *b;
586 *b = tmp;
587 }
588
589
590 // Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM;
tag_original_data(bam1_t * orig,kstring_t * oa_tag)591 static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
592 char strand;
593 uint8_t *nm_tag, *old_oa_tag;
594 uint32_t *cigar;
595 int64_t nm = 0;
596 int i, res = 0;
597
598 ks_clear(oa_tag);
599
600 // if there is an existing OA tag the new one gets appended to it
601 if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
602 res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0;
603 }
604
605 if (orig->core.flag & BAM_FREVERSE)
606 strand = '-';
607 else
608 strand = '+';
609
610 if ((nm_tag = bam_aux_get(orig, "NM"))) {
611 nm = bam_aux2i(nm_tag);
612 }
613
614 res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
615
616 for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
617 res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
618 res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
619 }
620
621 if (nm_tag) {
622 res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
623 } else {
624 res |= ksprintf(oa_tag, "%d,;", orig->core.qual) < 0;
625 }
626
627 return res;
628 }
629
630
bam_clip(samFile * in,samFile * out,samFile * reject,char * bedfile,clipping_type clipping,cl_param_t * param)631 static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
632 clipping_type clipping, cl_param_t *param) {
633 int ret = 1, r, file_open = 0;
634
635 bam_hdr_t *header = NULL;
636 bam1_t *b = NULL, *b_tmp = NULL;
637 long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
638 long filtered = 0, written = 0, failed = 0;
639 kstring_t str = KS_INITIALIZE;
640 kstring_t oat = KS_INITIALIZE;
641 bed_entry_list_t *sites;
642 FILE *stats_fp = samtools_stderr;
643 khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
644
645 if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
646 fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n");
647 goto fail;
648 }
649
650 if ((header = sam_hdr_read(in)) == NULL) {
651 fprintf(samtools_stderr, "[ampliconclip] error: could not read header\n");
652 goto fail;
653 }
654
655 // changing pos can ruin coordinate sort order
656 if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
657 const char *new_order = "unknown";
658
659 if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
660 fprintf(samtools_stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
661 goto fail;
662 }
663 }
664
665 ks_free(&str);
666
667 if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
668 param->arg_list ? "CL" : NULL,
669 param->arg_list ? param->arg_list : NULL,
670 NULL) != 0) {
671 fprintf(samtools_stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
672 }
673 if (sam_hdr_write(out, header) < 0) {
674 fprintf(samtools_stderr, "[ampliconclip] error: could not write header.\n");
675 goto fail;
676 }
677
678 if (reject) {
679 if (sam_hdr_write(reject, header) < 0) {
680 fprintf(samtools_stderr, "[ampliconclip] error: could not write header to rejects file.\n");
681 goto fail;
682 }
683 }
684
685 b = bam_init1();
686 b_tmp = bam_init1();
687 if (!b || !b_tmp) {
688 fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
689 goto fail;
690 }
691
692 int32_t last_tid = -1;
693 int ref_found = 0;
694
695 while ((r = sam_read1(in, header, b)) >= 0) {
696 hts_pos_t pos;
697 int is_rev;
698 int p_size;
699 int been_clipped = 0, filter = 0;
700 int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
701 khiter_t itr;
702
703 l_count++;
704
705 if (b->core.tid != last_tid) {
706 const char *ref_name;
707
708 ref_found = 0;
709 last_tid = b->core.tid;
710
711 if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
712 itr = kh_get(bed_list_hash, bed_hash, ref_name);
713
714 if (itr != kh_end(bed_hash)) {
715 sites = &kh_val(bed_hash, itr);
716 ref_found = 1;
717 }
718 }
719 }
720
721 if (!(b->core.flag & exclude) && ref_found) {
722 if (param->oa_tag)
723 if (tag_original_data(b, &oat))
724 goto fail;
725
726 if (!param->both) {
727 if (bam_is_rev(b)) {
728 pos = bam_endpos(b);
729 is_rev = 1;
730 } else {
731 pos = b->core.pos;
732 is_rev = 0;
733 }
734
735 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
736 if (is_rev) {
737 if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
738 goto fail;
739
740 swap_bams(&b, &b_tmp);
741 r_count++;
742 } else {
743 if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
744 goto fail;
745
746 swap_bams(&b, &b_tmp);
747 f_count++;
748 }
749
750 if (param->oa_tag) {
751 if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
752 goto fail;
753 }
754
755 if (param->del_tag) {
756 uint8_t *tag;
757
758 if ((tag = bam_aux_get(b, "NM")))
759 bam_aux_del(b, tag);
760
761 if ((tag = bam_aux_get(b, "MD")))
762 bam_aux_del(b, tag);
763 }
764
765 been_clipped = 1;
766 } else {
767 if (param->mark_fail) {
768 b->core.flag |= BAM_FQCFAIL;
769 }
770
771 n_count++;
772 }
773 } else {
774 int left = 0, right = 0;
775
776 // left first
777 pos = b->core.pos;
778 is_rev = 0;
779
780 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
781 if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
782 goto fail;
783
784 swap_bams(&b, &b_tmp);
785 f_count++;
786 left = 1;
787 been_clipped = 1;
788 }
789
790 // the right
791 pos = bam_endpos(b);
792 is_rev = 1;
793
794 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
795 if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
796 goto fail;
797
798 swap_bams(&b, &b_tmp);
799 r_count++;
800 right = 1;
801 been_clipped = 1;
802 }
803
804 if (left || right) {
805 uint8_t *tag;
806
807 if (param->oa_tag) {
808 if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
809 goto fail;
810 }
811
812 if (param->del_tag) {
813 if ((tag = bam_aux_get(b, "NM")))
814 bam_aux_del(b, tag);
815
816 if ((tag = bam_aux_get(b, "MD")))
817 bam_aux_del(b, tag);
818 }
819 }
820
821 if (left && right) {
822 b_count++;
823 } else if (!left && !right) {
824 if (param->mark_fail) {
825 b->core.flag |= BAM_FQCFAIL;
826 }
827
828 n_count++;
829 }
830 }
831
832 if (param->fail_len >= 0 || param->filter_len >= 0) {
833 hts_pos_t aql = active_query_len(b);
834
835 if (param->fail_len >= 0 && aql <= param->fail_len) {
836 b->core.flag |= BAM_FQCFAIL;
837 }
838
839 if (param->filter_len >= 0 && aql <= param->filter_len) {
840 filter = 1;
841 }
842 }
843
844 if (b->core.flag & BAM_FQCFAIL) {
845 failed++;
846 }
847
848 if (param->write_clipped && !been_clipped) {
849 filter = 1;
850 }
851
852 } else {
853 l_exclude++;
854
855 if (param->unmapped) {
856 filter = 1;
857 }
858 }
859
860 if (!filter) {
861 if (sam_write1(out, header, b) < 0) {
862 fprintf(samtools_stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
863 goto fail;
864 }
865
866 written++;
867 } else {
868 if (reject) {
869 if (sam_write1(reject, header, b) < 0) {
870 fprintf(samtools_stderr, "[ampliconclip] error: could not write to reject file %s\n",
871 param->rejects_file);
872 goto fail;
873 }
874 }
875
876 filtered++;
877 }
878 }
879
880 if (r < -1) {
881 fprintf(samtools_stderr, "[ampliconclip] error: failed to read input.\n");
882 goto fail;
883 }
884
885 if (param->stats_file) {
886 if ((stats_fp = fopen(param->stats_file, "w")) == NULL) {
887 fprintf(samtools_stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
888 } else {
889 file_open = 1;
890 }
891 }
892
893 fprintf(stats_fp, "COMMAND: %s\n"
894 "TOTAL READS: %ld\n"
895 "TOTAL CLIPPED: %ld\n"
896 "FORWARD CLIPPED: %ld\n"
897 "REVERSE CLIPPED: %ld\n"
898 "BOTH CLIPPED: %ld\n"
899 "NOT CLIPPED: %ld\n"
900 "EXCLUDED: %ld\n"
901 "FILTERED: %ld\n"
902 "FAILED: %ld\n"
903 "WRITTEN: %ld\n", param->arg_list, l_count, f_count + r_count,
904 f_count, r_count, b_count, n_count, l_exclude,
905 filtered, failed, written);
906
907 if (file_open) {
908 fclose(stats_fp);
909 }
910
911 ret = 0;
912
913 fail:
914 destroy_bed_hash(bed_hash);
915 ks_free(&oat);
916 sam_hdr_destroy(header);
917 bam_destroy1(b);
918 bam_destroy1(b_tmp);
919 return ret;
920 }
921
922
usage(void)923 static void usage(void) {
924 fprintf(samtools_stderr, "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n");
925 fprintf(samtools_stderr, "Option: \n");
926 fprintf(samtools_stderr, " -b FILE BED file of regions (eg amplicon primers) to be removed.\n");
927 fprintf(samtools_stderr, " -o FILE output file name (default samtools_stdout).\n");
928 fprintf(samtools_stderr, " -f FILE write stats to file name (default samtools_stderr)\n");
929 fprintf(samtools_stderr, " -u Output uncompressed data\n");
930 fprintf(samtools_stderr, " --soft-clip soft clip amplicon primers from reads (default)\n");
931 fprintf(samtools_stderr, " --hard-clip hard clip amplicon primers from reads.\n");
932 fprintf(samtools_stderr, " --both-ends clip on both 5' and 3' ends.\n");
933 fprintf(samtools_stderr, " --strand use strand data from BED file to match read direction.\n");
934 fprintf(samtools_stderr, " --clipped only output clipped reads.\n");
935 fprintf(samtools_stderr, " --fail mark unclipped, mapped reads as QCFAIL.\n");
936 fprintf(samtools_stderr, " --filter-len INT do not output reads INT size or shorter.\n");
937 fprintf(samtools_stderr, " --fail-len INT mark as QCFAIL reads INT size or shorter.\n");
938 fprintf(samtools_stderr, " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n");
939 fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n");
940 fprintf(samtools_stderr, " --original for clipped entries add an OA tag with original data.\n");
941 fprintf(samtools_stderr, " --keep-tag for clipped entries keep the old NM and MD tags.\n");
942 fprintf(samtools_stderr, " --tolerance match region within this number of bases, default 5.\n");
943 fprintf(samtools_stderr, " --no-PG do not add an @PG line.\n");
944 sam_global_opt_help(samtools_stderr, "-.O..@-.");
945 fprintf(samtools_stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
946 "Default clipping is only on the 5' end.\n\n");
947 }
948
949
amplicon_clip_main(int argc,char ** argv)950 int amplicon_clip_main(int argc, char **argv) {
951 int c, ret;
952 char wmode[4] = {'w', 'b', 0, 0};
953 char *bedfile = NULL, *fnout = "-";
954 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
955 htsThreadPool p = {NULL, 0};
956 samFile *in = NULL, *out = NULL, *reject = NULL;
957 clipping_type clipping = soft_clip;
958 cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
959
960 static const struct option lopts[] = {
961 SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
962 {"no-PG", no_argument, NULL, 1002},
963 {"soft-clip", no_argument, NULL, 1003},
964 {"hard-clip", no_argument, NULL, 1004},
965 {"strand", no_argument, NULL, 1005},
966 {"clipped", no_argument, NULL, 1006},
967 {"fail", no_argument, NULL, 1007},
968 {"both-ends", no_argument, NULL, 1008},
969 {"filter-len", required_argument, NULL, 1009},
970 {"fail-len", required_argument, NULL, 1010},
971 {"no-excluded", no_argument, NULL, 1011},
972 {"rejects-file", required_argument, NULL, 1012},
973 {"original", no_argument, NULL, 1013},
974 {"keep-tag", no_argument, NULL, 1014},
975 {"tolerance", required_argument, NULL, 1015},
976 {NULL, 0, NULL, 0}
977 };
978
979 while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
980 switch (c) {
981 case 'b': bedfile = optarg; break;
982 case 'o': fnout = optarg; break;
983 case 'f': param.stats_file = optarg; break;
984 case 'u': wmode[2] = '0'; break;
985 case 1002: param.add_pg = 0; break;
986 case 1003: clipping = soft_clip; break;
987 case 1004: clipping = hard_clip; break;
988 case 1005: param.use_strand = 1; break;
989 case 1006: param.write_clipped = 1; break;
990 case 1007: param.mark_fail = 1; break;
991 case 1008: param.both = 1; break;
992 case 1009: param.filter_len = atoi(optarg); break;
993 case 1010: param.fail_len = atoi(optarg); break;
994 case 1011: param.unmapped = 1; break;
995 case 1012: param.rejects_file = optarg; break;
996 case 1013: param.oa_tag = 1; break;
997 case 1014: param.del_tag = 0; break;
998 case 1015: param.tol = atoi(optarg); break;
999 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
1000 /* else fall-through */
1001 case '?': usage(); samtools_exit(1);
1002 }
1003 }
1004
1005 if (!bedfile) {
1006 usage();
1007 return 1;
1008 }
1009
1010 if (optind + 1 > argc) {
1011 usage();
1012 return 1;
1013 }
1014
1015 if (param.tol < 0) {
1016 fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
1017 " reseting tolerance to default of 5.\n", param.tol);
1018 param.tol = 5;
1019 }
1020
1021 if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
1022 print_error_errno("ampliconclip", "cannot open input file");
1023 return 1;
1024 }
1025
1026 sam_open_mode(wmode+1, fnout, NULL);
1027
1028 if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
1029 print_error_errno("ampliconclip", "cannot open output file");
1030 return 1;
1031 }
1032
1033 if (param.rejects_file) {
1034 sam_open_mode(wmode+1, param.rejects_file, NULL);
1035
1036 if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
1037 print_error_errno("ampliconclip", "cannot open rejects file");
1038 return 1;
1039 }
1040 }
1041
1042 if (ga.nthreads > 0) {
1043 if (!(p.pool = hts_tpool_init(ga.nthreads))) {
1044 fprintf(samtools_stderr, "[ampliconclip] error: cannot create thread pool.\n");
1045 return 1;
1046 }
1047 hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
1048 hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
1049
1050 if (reject) {
1051 hts_set_opt(reject, HTS_OPT_THREAD_POOL, &p);
1052 }
1053 }
1054
1055 param.arg_list = stringify_argv(argc + 1, argv - 1);
1056
1057 ret = bam_clip(in, out, reject, bedfile, clipping, ¶m);
1058
1059 // cleanup
1060 sam_close(in);
1061
1062 if (sam_close(out) < 0) {
1063 fprintf(samtools_stderr, "[ampliconclip] error: error while closing output file %s.\n", argv[optind+1]);
1064 ret = 1;
1065 }
1066
1067 if (reject) {
1068 if (sam_close(reject) < 0) {
1069 fprintf(samtools_stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
1070 ret = 1;
1071 }
1072 }
1073
1074 if (p.pool) hts_tpool_destroy(p.pool);
1075
1076 sam_global_args_free(&ga);
1077 free(param.arg_list);
1078
1079 return ret;
1080 }
1081
1082