1 #include "samtools.pysam.h"
2 
3 /*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
4                           from the 5' end.
5 
6     Copyright (C) 2020-2021 Genome Research Ltd.
7 
8     Authors: Andrew Whitwham <aw7@sanger.ac.uk>
9              Rob Davies <rmd+git@sanger.ac.uk>
10 
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to deal
13 in the Software without restriction, including without limitation the rights
14 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
17 
18 The above copyright notice and this permission notice shall be included in
19 all copies or substantial portions of the Software.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 DEALINGS IN THE SOFTWARE
28 */
29 
30 #include <config.h>
31 
32 #include <assert.h>
33 #include <stdbool.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include "htslib/thread_pool.h"
39 #include "sam_opts.h"
40 #include <htslib/hts.h>
41 #include "htslib/hfile.h"
42 #include "htslib/kstring.h"
43 #include "htslib/sam.h"
44 #include "samtools.h"
45 #include "bam_ampliconclip.h"
46 
47 typedef enum {
48     soft_clip,
49     hard_clip
50 } clipping_type;
51 
52 typedef struct {
53     int add_pg;
54     int use_strand;
55     int write_clipped;
56     int mark_fail;
57     int both;
58     int fail_len;
59     int filter_len;
60     int unmapped;
61     int oa_tag;
62     int del_tag;
63     int tol;
64     char *arg_list;
65     char *stats_file;
66     char *rejects_file;
67 } cl_param_t;
68 
69 
bed_entry_sort(const void * av,const void * bv)70 static int bed_entry_sort(const void *av, const void *bv) {
71     bed_entry_t *a = (bed_entry_t *) av;
72     bed_entry_t *b = (bed_entry_t *) bv;
73     return a->right < b->right ? -1 : (a->right == b->right ? 0 : 1);
74 }
75 
76 
load_bed_file_multi_ref(char * infile,int get_strand,int sort_by_pos,khash_t (bed_list_hash)* bed_lists)77 int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
78     hFILE *fp;
79     int line_count = 0, ret;
80     int64_t left, right;
81     kstring_t line = KS_INITIALIZE;
82     bed_entry_list_t *list;
83     khiter_t bed_itr;
84 
85     if ((fp = hopen(infile, "r")) == NULL) {
86         print_error_errno("amplicon", "unable to open file %s.", infile);
87         return 1;
88     }
89 
90     char ref[1024];
91 
92     while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
93         line_count++;
94         int hret;
95         char strand;
96 
97         if (line.l == 0 || *line.s == '#') continue;
98         if (strncmp(line.s, "track ", 6) == 0) continue;
99         if (strncmp(line.s, "browser ", 8) == 0) continue;
100 
101         if (get_strand) {
102             if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
103                        ref, &left, &right, &strand) != 4) {
104                 fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
105                                 "(N.B. ref/chrom name limited to 1023 characters.)\n",
106                                     line_count, infile);
107                 ret = 1;
108                 goto error;
109             }
110         } else {
111             if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
112                        ref, &left, &right) != 3) {
113                 fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
114                                 "(N.B. ref/chrom name limited to 1023 characters.)\n",
115                                     line_count, infile);
116                 ret = 1;
117                 goto error;
118             }
119         }
120 
121         bed_itr = kh_get(bed_list_hash, bed_lists, ref);
122 
123         if (bed_itr == kh_end(bed_lists)) { // new ref entry
124             char *ref_name = strdup(ref); // need a copy for the hash key
125 
126             if (!ref_name) {
127                 fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
128                 ret = 1;
129                 goto error;
130             }
131 
132             bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
133 
134             if (hret > 0) {
135                 list = &kh_val(bed_lists, bed_itr);
136 
137                 // initialise the new hash entry
138                 list->longest = 0;
139                 list->size = 0;
140                 list->length = 0;
141                 list->bp = NULL;
142             } else {
143                 fprintf(samtools_stderr, "[amplicon] error: ref hashing failure.\n");
144                 ret = 1;
145                 goto error;
146             }
147         } else { // existing ref
148             list = &kh_val(bed_lists, bed_itr);
149         }
150 
151         if (list->length == list->size) {
152            bed_entry_t *tmp;
153 
154            list->size += list->size / 2 + 256;
155 
156            if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) {
157                fprintf(samtools_stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
158                ret = 1;
159                goto error;
160            }
161 
162            list->bp = tmp;
163         }
164 
165         list->bp[list->length].left  = left;
166         list->bp[list->length].right = right;
167 
168         if (get_strand) {
169             if (strand == '+') {
170                 list->bp[list->length].rev = 0;
171             } else if (strand == '-') {
172                 list->bp[list->length].rev = 1;
173             } else {
174                 fprintf(samtools_stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
175                             line_count, strand);
176                 ret = 1;
177                 goto error;
178             }
179         }
180 
181         if (right - left > list->longest)
182             list->longest = right - left;
183 
184         list->length++;
185     }
186 
187     if (sort_by_pos) {
188         for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
189             if (kh_exist(bed_lists, bed_itr)) {
190                 list = &kh_val(bed_lists, bed_itr);
191                 qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
192             }
193         }
194     }
195 
196     if (kh_size(bed_lists) > 0) {// any entries
197         ret = 0;
198     } else {
199         ret = 1;
200     }
201 
202 error:
203     ks_free(&line);
204 
205     if (hclose(fp) != 0) {
206         fprintf(samtools_stderr, "[amplicon] warning: failed to close %s", infile);
207     }
208 
209     return ret;
210 }
211 
212 
destroy_bed_hash(khash_t (bed_list_hash)* hash)213 void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
214     khiter_t itr;
215 
216     for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) {
217        if (kh_exist(hash, itr)) {
218            free(kh_val(hash, itr).bp);
219            free((char *)kh_key(hash, itr));
220            kh_key(hash, itr) = NULL;
221         }
222     }
223 
224     kh_destroy(bed_list_hash, hash);
225 }
226 
227 
matching_clip_site(bed_entry_list_t * sites,hts_pos_t pos,int is_rev,int use_strand,int64_t longest,cl_param_t * param)228 static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
229                               int is_rev, int use_strand, int64_t longest,
230                               cl_param_t *param) {
231     int i, size;  // may need this to be variable
232     int tol = param->tol;
233     int l = 0, mid = sites->length / 2, r = sites->length;
234     int pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
235 
236     while (r - l > 1) {
237         if (sites->bp[mid].right <= pos_tol) {
238             l = mid;
239         } else {
240             r = mid;
241         }
242         mid = (l + r) / 2;
243     }
244 
245     size = 0;
246 
247     for (i = l; i < sites->length; i++) {
248         hts_pos_t mod_left, mod_right;
249 
250         if (use_strand && is_rev != sites->bp[i].rev)
251             continue;
252 
253         if (is_rev) {
254             mod_left = sites->bp[i].left;
255             mod_right = sites->bp[i].right + tol;
256         } else {
257             if (sites->bp[i].left > tol) {
258                 mod_left = sites->bp[i].left - tol;
259             } else {
260                 mod_left = 0;
261             }
262             mod_right = sites->bp[i].right;
263         }
264 
265         if (pos + longest + tol < mod_right)
266             break;
267 
268         if (pos >= mod_left && pos <= mod_right) {
269             if (is_rev) {
270                 if (size < pos - sites->bp[i].left) {
271                     size = pos - sites->bp[i].left;
272                 }
273             } else {
274                 if (size < sites->bp[i].right - pos) {
275                     size = sites->bp[i].right - pos;
276                 }
277             }
278         }
279     }
280 
281     return size;
282 }
283 
284 
bam_trim_left(bam1_t * rec,bam1_t * rec_out,uint32_t bases,clipping_type clipping)285 static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
286                          clipping_type clipping) {
287     uint32_t *orig_cigar = bam_get_cigar(rec);
288     uint8_t *orig_seq = bam_get_seq(rec);
289     uint8_t *orig_qual = bam_get_qual(rec);
290     uint8_t *orig_aux = bam_get_aux(rec);
291     uint32_t *new_cigar;
292     uint8_t *new_qual;
293     size_t orig_l_aux = bam_get_l_aux(rec);
294     uint32_t i, j, odd_base = 0;
295     uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
296     hts_pos_t new_pos = rec->core.pos;
297     uint32_t cig_type, cig_op;
298 
299     if (rec->l_data + 8 > rec_out->m_data) {
300         uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
301         if (!new_data) {
302             fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
303             return 1;
304         }
305         rec_out->data = new_data;
306         rec_out->m_data = rec->l_data + 8;
307     }
308 
309     // Copy core data & name
310     memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
311     memcpy(rec_out->data, rec->data, rec->core.l_qname);
312 
313     if (clipping == hard_clip && bases >= rec->core.l_qseq) {
314         rec_out->core.l_qseq = 0;
315         rec_out->core.n_cigar = 0;
316 
317         if (orig_l_aux)
318             memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
319 
320         rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
321 
322         return 0;
323     }
324 
325     // Modify CIGAR
326     new_cigar = bam_get_cigar(rec_out);
327 
328     for (i = 0;  i < rec->core.n_cigar; i++) {
329         cig_op = bam_cigar_op(orig_cigar[i]);
330         cig_type = bam_cigar_type(cig_op);
331 
332         if (cig_op == BAM_CHARD_CLIP) {
333             hardclip += bam_cigar_oplen(orig_cigar[i]);
334         } else {
335             if (cig_type & 2) {
336                 if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
337                     ref_remove -= bam_cigar_oplen(orig_cigar[i]);
338                 } else {
339                     break;
340                 }
341                 new_pos += bam_cigar_oplen(orig_cigar[i]);
342             }
343             if (cig_type & 1) {
344                 qry_removed += bam_cigar_oplen(orig_cigar[i]);
345             }
346         }
347     }
348 
349     if (i < rec->core.n_cigar) {
350         cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
351 
352         // account for the last operation
353         if (cig_type & 2) {
354             new_pos += ref_remove;
355         }
356         if (cig_type & 1) {
357             qry_removed += ref_remove;
358         }
359     } else {
360         qry_removed = rec->core.l_qseq;
361     }
362 
363     j = 0;
364     if (clipping == hard_clip && hardclip + qry_removed > 0) {
365         new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
366     }
367     if (clipping == soft_clip) {
368         if (hardclip > 0) {
369             new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
370         }
371         if (qry_removed > 0) {
372             new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
373         }
374     }
375 
376     if (i < rec->core.n_cigar
377         && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
378         new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
379 
380         // fill in the rest of the cigar
381         i++;
382 
383         for (; i < rec->core.n_cigar; i++) {
384             new_cigar[j++] = orig_cigar[i];
385         }
386     }
387 
388     rec_out->core.n_cigar = j;
389 
390     if (clipping == soft_clip) {
391         qry_removed = 0; // Copy all the sequence and confidence values
392         odd_base = 1; // account for an odd number of bases
393     }
394 
395     new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
396     // Copy remaining SEQ
397     if ((qry_removed & 1) == 0) {
398         memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
399                 (rec->core.l_qseq - qry_removed + odd_base) / 2);
400     } else {
401         uint8_t *in = orig_seq + qry_removed / 2;
402         uint8_t *out = bam_get_seq(rec_out);
403         uint32_t i;
404         for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) {
405             *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
406             in++;
407         }
408         if (i < rec->core.l_qseq) {
409             *out++ = (in[0] & 0x0f) << 4;
410         }
411         assert(out == new_qual);
412     }
413 
414     // Copy remaining QUAL
415     memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
416 
417     // Set new l_qseq
418     rec_out->core.l_qseq -= qry_removed;
419 
420     // Move AUX
421     if (orig_l_aux)
422         memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
423 
424     // Set new l_data
425     rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
426 
427     // put in new pos
428     rec_out->core.pos = new_pos;
429 
430     return 0;
431 }
432 
433 
bam_trim_right(bam1_t * rec,bam1_t * rec_out,uint32_t bases,clipping_type clipping)434 static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
435                           clipping_type clipping) {
436     uint32_t *orig_cigar = bam_get_cigar(rec);
437     uint8_t *orig_seq = bam_get_seq(rec);
438     uint8_t *orig_qual = bam_get_qual(rec);
439     uint8_t *orig_aux = bam_get_aux(rec);
440     uint32_t *new_cigar;
441     uint32_t new_n_cigar = 0;
442     uint8_t *new_qual;
443     size_t orig_l_aux = bam_get_l_aux(rec);
444     int32_t i;
445     int32_t j;
446     uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
447     uint32_t cig_type, cig_op;
448 
449     if (rec->l_data + 8 > rec_out->m_data) {
450         uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
451         if (!new_data) {
452             fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
453             return 1;
454         }
455         rec_out->data = new_data;
456         rec_out->m_data = rec->l_data + 8;
457     }
458 
459     // Copy core data & name
460     memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
461     memcpy(rec_out->data, rec->data, rec->core.l_qname);
462 
463     if (clipping == hard_clip && bases >= rec->core.l_qseq) {
464         rec_out->core.l_qseq = 0;
465         rec_out->core.n_cigar = 0;
466 
467         if (orig_l_aux)
468             memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
469 
470         rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
471         return 0;
472     }
473 
474     // Modify CIGAR here
475     new_cigar = bam_get_cigar(rec_out);
476 
477     for (i = rec->core.n_cigar - 1;  i >= 0; --i) {
478         cig_op = bam_cigar_op(orig_cigar[i]);
479         cig_type = bam_cigar_type(cig_op);
480 
481         if (cig_op == BAM_CHARD_CLIP) {
482             hardclip += bam_cigar_oplen(orig_cigar[i]);
483         } else {
484             if (cig_type & 2) {
485                 if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
486                     ref_remove -= bam_cigar_oplen(orig_cigar[i]);
487                 } else {
488                     break;
489                 }
490             }
491             if (cig_type & 1) {
492                 qry_removed += bam_cigar_oplen(orig_cigar[i]);
493             }
494         }
495     }
496 
497     if (i >= 0) {
498         cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
499         if (cig_type & 1) {
500             qry_removed += ref_remove;
501         }
502         j = i;
503         if (qry_removed > 0) j++;
504         if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
505     } else {
506         qry_removed = rec->core.l_qseq;
507         j = 0;
508         if (hardclip > 0 && clipping == soft_clip) j++;
509     }
510 
511     if (clipping == hard_clip && hardclip + qry_removed > 0) {
512         new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
513         new_n_cigar++;
514     }
515     if (clipping == soft_clip) {
516         if (hardclip > 0) {
517             new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
518             new_n_cigar++;
519             if (qry_removed > 0) --j;
520         }
521         if (qry_removed > 0) {
522             new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
523             new_n_cigar++;
524         }
525     }
526 
527     if (j > 0) {
528         new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
529         new_n_cigar++;
530     }
531 
532     // fill in the rest of the cigar
533     while (j > 0) {
534         new_cigar[--j] = orig_cigar[--i];
535         new_n_cigar++;
536     }
537 
538     rec_out->core.n_cigar = new_n_cigar;
539 
540     if (clipping == soft_clip)
541         qry_removed = 0; // Copy all the sequence and confidence values
542 
543     new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
544     // Copy remaining SEQ
545     memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
546 
547     // Copy remaining QUAL
548     memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
549 
550     // Set new l_qseq
551     rec_out->core.l_qseq -= qry_removed;
552 
553     // Copy AUX
554     if (orig_l_aux)
555         memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
556 
557     // Set new l_data
558     rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
559 
560     return 0;
561 }
562 
563 
active_query_len(bam1_t * b)564 static hts_pos_t active_query_len(bam1_t *b) {
565     uint32_t *cigar = bam_get_cigar(b);
566     uint32_t cig_type, cig_op;
567     hts_pos_t len = 0;
568     int i;
569 
570     for (i = 0; i < b->core.n_cigar; i++) {
571         cig_op =  bam_cigar_op(cigar[i]);
572         cig_type = bam_cigar_type(cig_op);
573 
574         if ((cig_type & 1) && (cig_op != BAM_CSOFT_CLIP)) {
575             len += bam_cigar_oplen(cigar[i]);
576         }
577     }
578 
579     return len;
580 }
581 
582 
swap_bams(bam1_t ** a,bam1_t ** b)583 static inline void swap_bams(bam1_t **a, bam1_t **b) {
584     bam1_t *tmp = *a;
585     *a = *b;
586     *b = tmp;
587 }
588 
589 
590 // Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM;
tag_original_data(bam1_t * orig,kstring_t * oa_tag)591 static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
592     char strand;
593     uint8_t *nm_tag, *old_oa_tag;
594     uint32_t *cigar;
595     int64_t nm = 0;
596     int i, res = 0;
597 
598     ks_clear(oa_tag);
599 
600     // if there is an existing OA tag the new one gets appended to it
601     if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
602         res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0;
603     }
604 
605     if (orig->core.flag & BAM_FREVERSE)
606         strand = '-';
607     else
608         strand = '+';
609 
610     if ((nm_tag = bam_aux_get(orig, "NM"))) {
611         nm = bam_aux2i(nm_tag);
612     }
613 
614     res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
615 
616     for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
617         res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
618         res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
619     }
620 
621     if (nm_tag) {
622         res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
623     } else {
624         res |= ksprintf(oa_tag, "%d,;", orig->core.qual) < 0;
625     }
626 
627     return res;
628 }
629 
630 
bam_clip(samFile * in,samFile * out,samFile * reject,char * bedfile,clipping_type clipping,cl_param_t * param)631 static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
632                     clipping_type clipping, cl_param_t *param) {
633     int ret = 1, r, file_open = 0;
634 
635     bam_hdr_t *header = NULL;
636     bam1_t *b = NULL, *b_tmp = NULL;
637     long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
638     long filtered = 0, written = 0, failed = 0;
639     kstring_t str = KS_INITIALIZE;
640     kstring_t oat = KS_INITIALIZE;
641     bed_entry_list_t *sites;
642     FILE *stats_fp = samtools_stderr;
643     khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
644 
645     if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
646         fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n");
647         goto fail;
648     }
649 
650     if ((header = sam_hdr_read(in)) == NULL) {
651         fprintf(samtools_stderr, "[ampliconclip] error: could not read header\n");
652         goto fail;
653     }
654 
655     // changing pos can ruin coordinate sort order
656     if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
657         const char *new_order = "unknown";
658 
659         if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
660             fprintf(samtools_stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
661             goto fail;
662         }
663     }
664 
665     ks_free(&str);
666 
667     if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
668                         param->arg_list ? "CL" : NULL,
669                         param->arg_list ? param->arg_list : NULL,
670                         NULL) != 0) {
671         fprintf(samtools_stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
672     }
673     if (sam_hdr_write(out, header) < 0) {
674         fprintf(samtools_stderr, "[ampliconclip] error: could not write header.\n");
675         goto fail;
676     }
677 
678     if (reject) {
679        if (sam_hdr_write(reject, header) < 0) {
680            fprintf(samtools_stderr, "[ampliconclip] error: could not write header to rejects file.\n");
681            goto fail;
682        }
683     }
684 
685     b = bam_init1();
686     b_tmp = bam_init1();
687     if (!b || !b_tmp) {
688         fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
689         goto fail;
690     }
691 
692     int32_t last_tid = -1;
693     int ref_found = 0;
694 
695     while ((r = sam_read1(in, header, b)) >= 0) {
696         hts_pos_t pos;
697         int is_rev;
698         int p_size;
699         int been_clipped  = 0, filter = 0;
700         int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
701         khiter_t itr;
702 
703         l_count++;
704 
705         if (b->core.tid != last_tid) {
706             const char *ref_name;
707 
708             ref_found = 0;
709             last_tid = b->core.tid;
710 
711             if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
712                 itr = kh_get(bed_list_hash, bed_hash, ref_name);
713 
714                 if (itr != kh_end(bed_hash)) {
715                     sites = &kh_val(bed_hash, itr);
716                     ref_found = 1;
717                 }
718             }
719         }
720 
721         if (!(b->core.flag & exclude) && ref_found) {
722             if (param->oa_tag)
723                 if (tag_original_data(b, &oat))
724                     goto fail;
725 
726             if (!param->both) {
727                 if (bam_is_rev(b)) {
728                     pos = bam_endpos(b);
729                     is_rev = 1;
730                 } else {
731                     pos = b->core.pos;
732                     is_rev = 0;
733                 }
734 
735                 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
736                     if (is_rev) {
737                         if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
738                             goto fail;
739 
740                         swap_bams(&b, &b_tmp);
741                         r_count++;
742                     } else {
743                         if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
744                             goto fail;
745 
746                         swap_bams(&b, &b_tmp);
747                         f_count++;
748                     }
749 
750                     if (param->oa_tag) {
751                         if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
752                             goto fail;
753                     }
754 
755                     if (param->del_tag) {
756                         uint8_t *tag;
757 
758                         if ((tag = bam_aux_get(b, "NM")))
759                             bam_aux_del(b, tag);
760 
761                         if ((tag = bam_aux_get(b, "MD")))
762                             bam_aux_del(b, tag);
763                     }
764 
765                     been_clipped = 1;
766                 } else {
767                     if (param->mark_fail) {
768                         b->core.flag |= BAM_FQCFAIL;
769                     }
770 
771                     n_count++;
772                 }
773             } else {
774                 int left = 0, right = 0;
775 
776                 // left first
777                 pos = b->core.pos;
778                 is_rev = 0;
779 
780                 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
781                     if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
782                         goto fail;
783 
784                     swap_bams(&b, &b_tmp);
785                     f_count++;
786                     left = 1;
787                     been_clipped = 1;
788                 }
789 
790                 // the right
791                 pos = bam_endpos(b);
792                 is_rev = 1;
793 
794                 if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
795                     if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
796                         goto fail;
797 
798                     swap_bams(&b, &b_tmp);
799                     r_count++;
800                     right = 1;
801                     been_clipped = 1;
802                 }
803 
804                 if (left || right) {
805                     uint8_t *tag;
806 
807                     if (param->oa_tag) {
808                         if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
809                             goto fail;
810                     }
811 
812                     if (param->del_tag) {
813                         if ((tag = bam_aux_get(b, "NM")))
814                             bam_aux_del(b, tag);
815 
816                         if ((tag = bam_aux_get(b, "MD")))
817                             bam_aux_del(b, tag);
818                     }
819                 }
820 
821                 if (left && right) {
822                     b_count++;
823                 } else if (!left && !right) {
824                     if (param->mark_fail) {
825                         b->core.flag |= BAM_FQCFAIL;
826                     }
827 
828                     n_count++;
829                 }
830             }
831 
832             if (param->fail_len >= 0 || param->filter_len >= 0) {
833                hts_pos_t aql = active_query_len(b);
834 
835                if (param->fail_len >= 0 && aql <= param->fail_len) {
836                    b->core.flag |= BAM_FQCFAIL;
837                }
838 
839                if (param->filter_len >= 0 && aql <= param->filter_len) {
840                    filter = 1;
841                }
842            }
843 
844            if (b->core.flag & BAM_FQCFAIL) {
845                failed++;
846            }
847 
848            if (param->write_clipped && !been_clipped) {
849                filter = 1;
850            }
851 
852         } else {
853             l_exclude++;
854 
855             if (param->unmapped) {
856                 filter = 1;
857             }
858         }
859 
860         if (!filter) {
861             if (sam_write1(out, header, b) < 0) {
862                 fprintf(samtools_stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
863                 goto fail;
864             }
865 
866             written++;
867         } else {
868             if (reject) {
869                 if (sam_write1(reject, header, b) < 0) {
870                     fprintf(samtools_stderr, "[ampliconclip] error: could not write to reject file %s\n",
871                             param->rejects_file);
872                     goto fail;
873                 }
874             }
875 
876             filtered++;
877         }
878     }
879 
880     if (r < -1) {
881         fprintf(samtools_stderr, "[ampliconclip] error: failed to read input.\n");
882         goto fail;
883     }
884 
885     if (param->stats_file) {
886         if ((stats_fp = fopen(param->stats_file, "w")) == NULL) {
887             fprintf(samtools_stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
888         } else {
889             file_open = 1;
890         }
891     }
892 
893     fprintf(stats_fp, "COMMAND: %s\n"
894                     "TOTAL READS: %ld\n"
895                     "TOTAL CLIPPED: %ld\n"
896                     "FORWARD CLIPPED: %ld\n"
897                     "REVERSE CLIPPED: %ld\n"
898                     "BOTH CLIPPED: %ld\n"
899                     "NOT CLIPPED: %ld\n"
900                     "EXCLUDED: %ld\n"
901                     "FILTERED: %ld\n"
902                     "FAILED: %ld\n"
903                     "WRITTEN: %ld\n", param->arg_list, l_count, f_count + r_count,
904                                     f_count, r_count, b_count, n_count, l_exclude,
905                                     filtered, failed, written);
906 
907     if (file_open) {
908         fclose(stats_fp);
909     }
910 
911     ret = 0;
912 
913 fail:
914     destroy_bed_hash(bed_hash);
915     ks_free(&oat);
916     sam_hdr_destroy(header);
917     bam_destroy1(b);
918     bam_destroy1(b_tmp);
919     return ret;
920 }
921 
922 
usage(void)923 static void usage(void) {
924     fprintf(samtools_stderr, "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n");
925     fprintf(samtools_stderr, "Option: \n");
926     fprintf(samtools_stderr, " -b  FILE            BED file of regions (eg amplicon primers) to be removed.\n");
927     fprintf(samtools_stderr, " -o  FILE            output file name (default samtools_stdout).\n");
928     fprintf(samtools_stderr, " -f  FILE            write stats to file name (default samtools_stderr)\n");
929     fprintf(samtools_stderr, " -u                  Output uncompressed data\n");
930     fprintf(samtools_stderr, " --soft-clip         soft clip amplicon primers from reads (default)\n");
931     fprintf(samtools_stderr, " --hard-clip         hard clip amplicon primers from reads.\n");
932     fprintf(samtools_stderr, " --both-ends         clip on both 5' and 3' ends.\n");
933     fprintf(samtools_stderr, " --strand            use strand data from BED file to match read direction.\n");
934     fprintf(samtools_stderr, " --clipped           only output clipped reads.\n");
935     fprintf(samtools_stderr, " --fail              mark unclipped, mapped reads as QCFAIL.\n");
936     fprintf(samtools_stderr, " --filter-len INT    do not output reads INT size or shorter.\n");
937     fprintf(samtools_stderr, " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n");
938     fprintf(samtools_stderr, " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n");
939     fprintf(samtools_stderr, " --rejects-file FILE file to write filtered reads.\n");
940     fprintf(samtools_stderr, " --original          for clipped entries add an OA tag with original data.\n");
941     fprintf(samtools_stderr, " --keep-tag          for clipped entries keep the old NM and MD tags.\n");
942     fprintf(samtools_stderr, " --tolerance         match region within this number of bases, default 5.\n");
943     fprintf(samtools_stderr, " --no-PG             do not add an @PG line.\n");
944     sam_global_opt_help(samtools_stderr, "-.O..@-.");
945     fprintf(samtools_stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
946                     "Default clipping is only on the 5' end.\n\n");
947 }
948 
949 
amplicon_clip_main(int argc,char ** argv)950 int amplicon_clip_main(int argc, char **argv) {
951     int c, ret;
952     char wmode[4] = {'w', 'b', 0, 0};
953     char *bedfile = NULL, *fnout = "-";
954     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
955     htsThreadPool p = {NULL, 0};
956     samFile *in = NULL, *out = NULL, *reject = NULL;
957     clipping_type clipping = soft_clip;
958     cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
959 
960     static const struct option lopts[] = {
961         SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
962         {"no-PG", no_argument, NULL, 1002},
963         {"soft-clip", no_argument, NULL, 1003},
964         {"hard-clip", no_argument, NULL, 1004},
965         {"strand", no_argument, NULL, 1005},
966         {"clipped", no_argument, NULL, 1006},
967         {"fail", no_argument, NULL, 1007},
968         {"both-ends", no_argument, NULL, 1008},
969         {"filter-len", required_argument, NULL, 1009},
970         {"fail-len", required_argument, NULL, 1010},
971         {"no-excluded", no_argument, NULL, 1011},
972         {"rejects-file", required_argument, NULL, 1012},
973         {"original", no_argument, NULL, 1013},
974         {"keep-tag", no_argument, NULL, 1014},
975         {"tolerance", required_argument, NULL, 1015},
976         {NULL, 0, NULL, 0}
977     };
978 
979     while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
980         switch (c) {
981             case 'b': bedfile = optarg; break;
982             case 'o': fnout = optarg; break;
983             case 'f': param.stats_file = optarg; break;
984             case 'u': wmode[2] = '0'; break;
985             case 1002: param.add_pg = 0; break;
986             case 1003: clipping = soft_clip; break;
987             case 1004: clipping = hard_clip; break;
988             case 1005: param.use_strand = 1; break;
989             case 1006: param.write_clipped = 1; break;
990             case 1007: param.mark_fail = 1; break;
991             case 1008: param.both = 1; break;
992             case 1009: param.filter_len = atoi(optarg); break;
993             case 1010: param.fail_len = atoi(optarg); break;
994             case 1011: param.unmapped = 1; break;
995             case 1012: param.rejects_file = optarg; break;
996             case 1013: param.oa_tag = 1; break;
997             case 1014: param.del_tag = 0; break;
998             case 1015: param.tol = atoi(optarg); break;
999             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
1000                       /* else fall-through */
1001             case '?': usage(); samtools_exit(1);
1002         }
1003     }
1004 
1005     if (!bedfile) {
1006         usage();
1007         return 1;
1008     }
1009 
1010     if (optind + 1 > argc) {
1011         usage();
1012         return 1;
1013     }
1014 
1015     if (param.tol < 0) {
1016         fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
1017                         " reseting tolerance to default of 5.\n", param.tol);
1018         param.tol = 5;
1019     }
1020 
1021     if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
1022         print_error_errno("ampliconclip", "cannot open input file");
1023         return 1;
1024     }
1025 
1026     sam_open_mode(wmode+1, fnout, NULL);
1027 
1028     if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
1029         print_error_errno("ampliconclip", "cannot open output file");
1030         return 1;
1031     }
1032 
1033     if (param.rejects_file) {
1034         sam_open_mode(wmode+1, param.rejects_file, NULL);
1035 
1036         if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
1037             print_error_errno("ampliconclip", "cannot open rejects file");
1038             return 1;
1039         }
1040     }
1041 
1042     if (ga.nthreads > 0) {
1043         if (!(p.pool = hts_tpool_init(ga.nthreads))) {
1044             fprintf(samtools_stderr, "[ampliconclip] error: cannot create thread pool.\n");
1045             return 1;
1046         }
1047         hts_set_opt(in,  HTS_OPT_THREAD_POOL, &p);
1048         hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
1049 
1050         if (reject) {
1051            hts_set_opt(reject,  HTS_OPT_THREAD_POOL, &p);
1052         }
1053     }
1054 
1055     param.arg_list = stringify_argv(argc + 1, argv - 1);
1056 
1057     ret = bam_clip(in, out, reject, bedfile, clipping, &param);
1058 
1059     // cleanup
1060     sam_close(in);
1061 
1062     if (sam_close(out) < 0) {
1063         fprintf(samtools_stderr, "[ampliconclip] error: error while closing output file %s.\n", argv[optind+1]);
1064         ret = 1;
1065     }
1066 
1067     if (reject) {
1068         if (sam_close(reject) < 0) {
1069             fprintf(samtools_stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
1070             ret = 1;
1071         }
1072     }
1073 
1074     if (p.pool) hts_tpool_destroy(p.pool);
1075 
1076     sam_global_args_free(&ga);
1077     free(param.arg_list);
1078 
1079     return ret;
1080 }
1081 
1082