1 /*  bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
2                      through fixmates with the mate scoring option on.
3 
4     Copyright (C) 2017-2021 Genome Research Ltd.
5 
6     Author: Andrew Whitwham <aw7@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE
25 
26 Estimate library size derived from Picard DuplicationMetrics.java
27 Copyright (c) 2009,2018 The Broad Institute.  MIT license.
28 */
29 
30 #include <config.h>
31 
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <ctype.h>
37 #include <time.h>
38 #include <sys/stat.h>
39 #include <math.h>
40 #include "htslib/thread_pool.h"
41 #include "htslib/sam.h"
42 #include "sam_opts.h"
43 #include "samtools.h"
44 #include "htslib/khash.h"
45 #include "htslib/klist.h"
46 #include "htslib/kstring.h"
47 #include "tmp_file.h"
48 
49 
50 typedef struct {
51     samFile *in;
52     samFile *out;
53     char *prefix;
54     int remove_dups;
55     int32_t max_length;
56     int do_stats;
57     int supp;
58     int tag;
59     int opt_dist;
60     int no_pg;
61     int clear;
62     int mode;
63     int write_index;
64     int include_fails;
65     int check_chain;
66     char *stats_file;
67     char *arg_list;
68     char *out_fn;
69 } md_param_t;
70 
71 typedef struct {
72     hts_pos_t this_coord;
73     hts_pos_t other_coord;
74     int32_t this_ref;
75     int32_t other_ref;
76     int8_t single;
77     int8_t leftmost;
78     int8_t orientation;
79 } key_data_t;
80 
81 typedef struct read_queue_s {
82     key_data_t pair_key;
83     key_data_t single_key;
84     bam1_t *b;
85     struct read_queue_s *duplicate;
86     hts_pos_t pos;
87     int dup_checked;
88 } read_queue_t;
89 
90 typedef struct {
91     read_queue_t *p;
92 } in_hash_t;
93 
94 typedef struct {
95     char *name;
96     char type;
97 } dup_map_t;
98 
99 typedef struct {
100     bam1_t *b;
101     int64_t score;
102     int64_t mate_score;
103     long x;
104     long y;
105     int opt;
106     int xpos;
107 } check_t;
108 
109 
110 typedef struct {
111     check_t *c;
112     size_t size;
113     size_t length;
114 } check_list_t;
115 
116 static khint32_t do_hash(unsigned char *key, khint32_t len);
117 
hash_key(key_data_t key)118 static khint_t hash_key(key_data_t key) {
119     int i = 0;
120     khint_t hash;
121 
122     if (key.single) {
123         unsigned char sig[13];
124 
125         memcpy(sig + i, &key.this_ref, 4);      i += 4;
126         memcpy(sig + i, &key.this_coord, 8);    i += 8;
127         memcpy(sig + i, &key.orientation, 1);   i += 1;
128 
129         hash = do_hash(sig, i);
130     } else {
131         unsigned char sig[26];
132 
133         memcpy(sig + i, &key.this_ref, 4);      i += 4;
134         memcpy(sig + i, &key.this_coord, 8);    i += 8;
135         memcpy(sig + i, &key.other_ref, 4);     i += 4;
136         memcpy(sig + i, &key.other_coord, 8);   i += 8;
137         memcpy(sig + i, &key.leftmost, 1);      i += 1;
138         memcpy(sig + i, &key.orientation, 1);   i += 1;
139 
140         hash = do_hash(sig, i);
141     }
142 
143     return hash;
144 }
145 
146 
key_equal(key_data_t a,key_data_t b)147 static int key_equal(key_data_t a, key_data_t b) {
148     int match = 1;
149 
150     if (a.this_coord != b.this_coord)
151         match = 0;
152     else if (a.orientation != b.orientation)
153         match = 0;
154     else if (a.this_ref != b.this_ref)
155         match = 0;
156     else if (a.single != b.single)
157         match = 0;
158 
159     if (!a.single) {
160         if (a.other_coord != b.other_coord)
161             match = 0;
162         else if (a.leftmost != b.leftmost)
163             match = 0;
164         else if (a.other_ref != b.other_ref)
165             match = 0;
166     }
167 
168     return match;
169 }
170 
171 
172 #define __free_queue_element(p)
173 
174 // Orientations (prime numbers to feed to hashing algorithm)
175 #define O_FF 2
176 #define O_RR 3
177 #define O_FR 5
178 #define O_RF 7
179 
180 // Left or rightmost
181 #define R_LE 11
182 #define R_RI 13
183 
184 #define BMD_WARNING_MAX 10
185 
186 #define MD_MIN_QUALITY 15
187 
188 // Duplicate finding mode
189 #define MD_MODE_TEMPLATE 0
190 #define MD_MODE_SEQUENCE 1
191 
192 KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
KLIST_INIT(read_queue,read_queue_t,__free_queue_element)193 KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
194 KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
195 
196 
197 /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
198 
199 static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
200     char *c = cigar;
201     int64_t clipped = 0;
202 
203     while (*c && *c != '*') {
204         long num = 0;
205 
206         if (isdigit((int)*c)) {
207             num = strtol(c, &c, 10);
208         } else {
209             num = 1;
210         }
211 
212         if (*c == 'S' || *c == 'H') { // clips
213             clipped += num;
214         } else {
215             break;
216         }
217 
218         c++;
219     }
220 
221     return op - clipped + 1;
222 }
223 
224 
225 /* Calculate the current read's start based on the stored cigar string. */
226 
unclipped_start(bam1_t * b)227 static hts_pos_t unclipped_start(bam1_t *b) {
228     uint32_t *cigar = bam_get_cigar(b);
229     int64_t clipped = 0;
230     uint32_t i;
231 
232     for (i = 0; i < b->core.n_cigar; i++) {
233         char c = bam_cigar_opchr(cigar[i]);
234 
235         if (c == 'S' || c == 'H') { // clips
236             clipped += bam_cigar_oplen(cigar[i]);
237         } else {
238             break;
239         }
240     }
241 
242     return b->core.pos - clipped + 1;
243 }
244 
245 
246 /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
247 
unclipped_other_end(int64_t op,char * cigar)248 static hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
249     char *c = cigar;
250     int64_t refpos = 0;
251     int skip = 1;
252 
253     while (*c && *c != '*') {
254         long num = 0;
255 
256         if (isdigit((int)*c)) {
257             num = strtol(c, &c, 10);
258         } else {
259             num = 1;
260         }
261 
262         switch (*c) {
263             case 'M':
264             case 'D':
265             case 'N':
266             case '=':
267             case 'X':
268                 refpos += num;
269                 skip = 0; // ignore initial clips
270             break;
271 
272             case 'S':
273             case 'H':
274                 if (!skip) {
275                 refpos += num;
276             }
277             break;
278         }
279 
280         c++;
281    }
282 
283     return  op + refpos;
284 }
285 
286 
287 /* Calculate the current read's end based on the stored cigar string. */
288 
unclipped_end(bam1_t * b)289 static hts_pos_t unclipped_end(bam1_t *b) {
290     uint32_t *cigar = bam_get_cigar(b);
291     hts_pos_t end_pos, clipped = 0;
292     int32_t i;
293 
294     end_pos = bam_endpos(b);
295 
296     // now get the clipped end bases (if any)
297     // if we get to the beginning of the cigar string
298     // without hitting a non-clip then the results are meaningless
299     for (i = b->core.n_cigar - 1; i >= 0; i--) {
300         char c = bam_cigar_opchr(cigar[i]);
301 
302         if (c == 'S' || c == 'H') { // clips
303             clipped += bam_cigar_oplen(cigar[i]);
304         } else {
305             break;
306         }
307     }
308 
309     return end_pos + clipped;
310 }
311 
312 
313 /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
314 
do_hash(unsigned char * key,khint32_t len)315 static khint32_t do_hash(unsigned char *key, khint32_t len) {
316     khint32_t   hash, i;
317 
318     for (hash = 0, i = 0; i < len; ++i) {
319         hash += key[i];
320         hash += (hash << 10);
321         hash ^= (hash >> 6);
322     }
323 
324     hash += (hash << 3);
325     hash ^= (hash >> 11);
326     hash += (hash << 15);
327 
328     return hash;
329 }
330 
331 
332 /* Get mate score from tag. */
333 
get_mate_score(bam1_t * b)334 static int64_t get_mate_score(bam1_t *b) {
335     uint8_t *data;
336     int64_t score;
337 
338     if ((data = bam_aux_get(b, "ms"))) {
339         score = bam_aux2i(data);
340     } else {
341         fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
342         return -1;
343     }
344 
345     return score;
346 }
347 
348 
349 /* Calc current score from quality. */
350 
calc_score(bam1_t * b)351 static int64_t calc_score(bam1_t *b)
352 {
353     int64_t score = 0;
354     uint8_t  *qual = bam_get_qual(b);
355     int i;
356 
357     for (i = 0; i < b->core.l_qseq; i++) {
358         if (qual[i] >= MD_MIN_QUALITY) score += qual[i];
359     }
360 
361     return score;
362 }
363 
364 
365 /* Create a signature hash of the current read and its pair.
366    Uses the unclipped start (or end depending on orientation),
367    the reference id, orientation and whether the current
368    read is leftmost of the pair. */
369 
make_pair_key_template(key_data_t * key,bam1_t * bam)370 static int make_pair_key_template(key_data_t *key, bam1_t *bam) {
371     hts_pos_t this_coord, other_coord, this_end, other_end;
372     int32_t this_ref, other_ref;
373     int8_t orientation, leftmost;
374     uint8_t *data;
375     char *cig;
376 
377     this_ref    = bam->core.tid + 1; // avoid a 0 being put into the hash
378     other_ref   = bam->core.mtid + 1;
379 
380     this_coord = unclipped_start(bam);
381     this_end   = unclipped_end(bam);
382 
383     if ((data = bam_aux_get(bam, "MC"))) {
384         if (!(cig = bam_aux2Z(data))) {
385             fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
386             return 1;
387         }
388 
389         other_end   = unclipped_other_end(bam->core.mpos, cig);
390         other_coord = unclipped_other_start(bam->core.mpos, cig);
391     } else {
392         fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
393         return 1;
394     }
395 
396     // work out orientations
397     if (this_ref != other_ref) {
398         leftmost = this_ref < other_ref;
399     } else {
400         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
401             if (!bam_is_rev(bam)) {
402                 leftmost = this_coord <= other_coord;
403             } else {
404                 leftmost = this_end <= other_end;
405             }
406         } else {
407             if (bam_is_rev(bam)) {
408                 leftmost = this_end <= other_coord;
409             } else {
410                 leftmost = this_coord <= other_end;
411             }
412         }
413     }
414 
415     // pair orientation
416     if (leftmost) {
417         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
418             other_coord = other_end;
419 
420             if (!bam_is_rev(bam)) {
421                 if (bam->core.flag & BAM_FREAD1) {
422                     orientation = O_FF;
423                 } else {
424                     orientation = O_RR;
425                 }
426             } else {
427                 if (bam->core.flag & BAM_FREAD1) {
428                     orientation = O_RR;
429                 } else {
430                     orientation = O_FF;
431                 }
432             }
433         } else {
434             if (!bam_is_rev(bam)) {
435                 orientation = O_FR;
436                 other_coord = other_end;
437             } else {
438                 orientation = O_RF;
439                 this_coord = this_end;
440             }
441         }
442     } else {
443         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
444             this_coord = this_end;
445 
446             if (!bam_is_rev(bam)) {
447                 if (bam->core.flag & BAM_FREAD1) {
448                     orientation = O_RR;
449                 } else {
450                     orientation = O_FF;
451                 }
452             } else {
453                 if (bam->core.flag & BAM_FREAD1) {
454                     orientation = O_FF;
455                 } else {
456                     orientation = O_RR;
457                 }
458             }
459         } else {
460             if (!bam_is_rev(bam)) {
461                 orientation = O_RF;
462                 other_coord = other_end;
463             } else {
464                 orientation = O_FR;
465                 this_coord = this_end;
466             }
467         }
468     }
469 
470     if (!leftmost)
471         leftmost = R_RI;
472     else
473         leftmost = R_LE;
474 
475     key->single        = 0;
476     key->this_ref      = this_ref;
477     key->this_coord    = this_coord;
478     key->other_ref     = other_ref;
479     key->other_coord   = other_coord;
480     key->leftmost      = leftmost;
481     key->orientation   = orientation;
482 
483     return 0;
484 }
485 
486 
make_pair_key_sequence(key_data_t * key,bam1_t * bam)487 static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) {
488     hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
489     int32_t this_ref, other_ref;
490     int8_t orientation, left_read;
491     uint8_t *data;
492     char *cig;
493 
494     this_ref    = bam->core.tid + 1; // avoid a 0 being put into the hash
495     other_ref   = bam->core.mtid + 1;
496 
497     this_coord = unclipped_start(bam);
498     this_end   = unclipped_end(bam);
499 
500     if ((data = bam_aux_get(bam, "MC"))) {
501         if (!(cig = bam_aux2Z(data))) {
502             fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
503             return 1;
504         }
505 
506         other_end   = unclipped_other_end(bam->core.mpos, cig);
507         other_coord = unclipped_other_start(bam->core.mpos, cig);
508     } else {
509         fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
510         return 1;
511     }
512 
513     // work out orientations
514     if (this_ref != other_ref) {
515         leftmost = this_ref - other_ref;
516     } else {
517         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
518             if (!bam_is_rev(bam)) {
519                 leftmost = this_coord - other_coord;
520             } else {
521                 leftmost = this_end - other_end;
522             }
523         } else {
524             if (bam_is_rev(bam)) {
525                 leftmost = this_end - other_coord;
526             } else {
527                 leftmost = this_coord - other_end;
528             }
529         }
530     }
531 
532     if (leftmost < 0) {
533         leftmost = 1;
534     } else if (leftmost > 0) {
535         leftmost = 0;
536     } else {
537         // tie breaks
538 
539         if (bam->core.pos == bam->core.mpos) {
540             if (bam->core.flag & BAM_FREAD1) {
541                 leftmost = 1;
542             } else {
543                 leftmost = 0;
544             }
545         } else if (bam->core.pos < bam->core.mpos) {
546             leftmost = 1;
547         } else {
548             leftmost = 0;
549         }
550     }
551 
552     // pair orientation
553     if (leftmost) {
554         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
555 
556             if (!bam_is_rev(bam)) {
557                     orientation = O_FF;
558             } else {
559                     orientation = O_RR;
560             }
561         } else {
562             if (!bam_is_rev(bam)) {
563                 orientation = O_FR;
564             } else {
565                 orientation = O_RF;
566             }
567         }
568     } else {
569         if (bam_is_rev(bam) == bam_is_mrev(bam)) {
570 
571             if (!bam_is_rev(bam)) {
572                     orientation = O_RR;
573             } else {
574                     orientation = O_FF;
575             }
576         } else {
577             if (!bam_is_rev(bam)) {
578                 orientation = O_RF;
579             } else {
580                 orientation = O_FR;
581             }
582         }
583     }
584 
585     if (!leftmost)
586         left_read = R_RI;
587     else
588         left_read = R_LE;
589 
590     if (!bam_is_rev(bam)) {
591         this_coord = unclipped_start(bam);
592     } else {
593         this_coord = unclipped_end(bam);
594     }
595 
596     if (!bam_is_mrev(bam)) {
597         other_coord = unclipped_other_start(bam->core.mpos, cig);
598     } else {
599         other_coord = unclipped_other_end(bam->core.mpos, cig);
600     }
601 
602     key->single        = 0;
603     key->this_ref      = this_ref;
604     key->this_coord    = this_coord;
605     key->other_ref     = other_ref;
606     key->other_coord   = other_coord;
607     key->leftmost      = left_read;
608     key->orientation   = orientation;
609 
610     return 0;
611 }
612 
613 /* Create a signature hash of single read (or read with an unmatched pair).
614    Uses unclipped start (or end depending on orientation), reference id,
615    and orientation. */
616 
make_single_key(key_data_t * key,bam1_t * bam)617 static void make_single_key(key_data_t *key, bam1_t *bam) {
618     hts_pos_t this_coord;
619     int32_t this_ref;
620     int8_t orientation;
621 
622     this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
623 
624     if (bam_is_rev(bam)) {
625         this_coord = unclipped_end(bam);
626         orientation = O_RR;
627     } else {
628         this_coord = unclipped_start(bam);
629         orientation = O_FF;
630     }
631 
632     key->single        = 1;
633     key->this_ref      = this_ref;
634     key->this_coord    = this_coord;
635     key->orientation   = orientation;
636 }
637 
638 
639 /* Add the duplicate name to a hash if it does not exist. */
640 
add_duplicate(khash_t (duplicates)* d_hash,bam1_t * dupe,char * orig_name,char type)641 static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) {
642     khiter_t d;
643     int ret;
644 
645     d = kh_get(duplicates, d_hash, bam_get_qname(dupe));
646 
647     if (d == kh_end(d_hash)) {
648         char *name = strdup(bam_get_qname(dupe));
649         if (name) {
650             d = kh_put(duplicates, d_hash, name, &ret);
651         } else {
652             ret = -1;
653         }
654 
655         if (ret >= 0) {
656             if (orig_name) {
657                 if (ret == 0) {
658                     // replace old name
659                     free(kh_value(d_hash, d).name);
660                     free(name);
661                 }
662 
663                 kh_value(d_hash, d).name = strdup(orig_name);
664 
665                 if (kh_value(d_hash, d).name == NULL) {
666                     fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n");
667                     return 1;
668                 }
669             } else {
670                 kh_value(d_hash, d).name = NULL;
671             }
672 
673             kh_value(d_hash, d).type = type;
674         } else {
675             fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n");
676             free(name);
677             return 1;
678         }
679     }
680 
681     return 0;
682 }
683 
684 
685 /* Get the position of the coordinates from the read name. */
get_coordinate_positions(const char * qname,int * xpos,int * ypos)686 static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
687     int sep = 0;
688     int pos = 0;
689 
690     while (qname[pos]) {
691         if (qname[pos] == ':') {
692             sep++;
693 
694             if (sep == 2) {
695                 *xpos = pos + 1;
696             } else if (sep == 3) {
697                 *ypos = pos + 1;
698             } else if (sep == 4) { // HiSeq style names
699                 *xpos = *ypos;
700                 *ypos = pos + 1;
701             } else if (sep == 5) { // Newer Illumina format
702                 *xpos = pos + 1;
703             } else if (sep == 6) {
704                 *ypos = pos + 1;
705             }
706         }
707 
708         pos++;
709     }
710 
711     return sep;
712 }
713 
714 
get_coordinates(const char * name,int * xpos_out,long * x_coord,long * y_coord,long * warnings)715 static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
716     int ret = 1;
717     int seps, xpos = 0, ypos = 0;
718     long x = 0, y = 0;
719     char *end;
720 
721     seps = get_coordinate_positions(name, &xpos, &ypos);
722 
723     /* The most current Illumina read format at time of writing is:
724        @machine:run:flowcell:lane:tile:x:y:UMI or
725        @machine:run:flowcell:lane:tile:x:y
726 
727        Counting the separating colons gives us a quick format check.
728        Older name formats have fewer elements.
729     */
730 
731     if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
732         (*warnings)++;
733 
734         if (*warnings <= BMD_WARNING_MAX) {
735             fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
736         }
737 
738         return ret;
739     }
740 
741     x = strtol(name + xpos, &end, 10);
742 
743     if ((name + xpos) == end) {
744         (*warnings)++;
745 
746         if (*warnings <= BMD_WARNING_MAX) {
747             fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
748         }
749 
750         return ret;
751     }
752 
753     y = strtol(name + ypos, &end, 10);
754 
755     if ((name + ypos) == end) {
756         (*warnings)++;
757 
758         if (*warnings <= BMD_WARNING_MAX) {
759             fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
760         }
761 
762         return ret;
763     }
764 
765     *x_coord = x;
766     *y_coord = y;
767     *xpos_out = xpos;
768     ret = 0;
769 
770     return ret;
771 }
772 
773 
774 /* Using the coordinates from the Illumina read name, see whether the duplicated read is
775    close enough (set by max_dist) to the original to be counted as optical.*/
776 
optical_duplicate(bam1_t * ori,bam1_t * dup,long max_dist,long * warnings)777 static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
778     int ret = 0, seps;
779     char *original, *duplicate;
780     int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0;
781 
782 
783     original  = bam_get_qname(ori);
784     duplicate = bam_get_qname(dup);
785 
786     seps = get_coordinate_positions(original, &oxpos, &oypos);
787 
788     if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
789         (*warnings)++;
790 
791         if (*warnings <= BMD_WARNING_MAX) {
792             fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original);
793         }
794 
795         return ret;
796     }
797 
798     seps = get_coordinate_positions(duplicate, &dxpos, &dypos);
799 
800     if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
801 
802         (*warnings)++;
803 
804         if (*warnings <= BMD_WARNING_MAX) {
805             fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate);
806         }
807 
808         return ret;
809     }
810 
811     if (strncmp(original, duplicate, oxpos - 1) == 0) {
812         // the initial parts match, look at the numbers
813         long ox, oy, dx, dy, xdiff, ydiff;
814         char *end;
815 
816         ox = strtol(original + oxpos, &end, 10);
817 
818         if ((original + oxpos) == end) {
819             (*warnings)++;
820 
821             if (*warnings <= BMD_WARNING_MAX) {
822                 fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original);
823             }
824 
825             return ret;
826         }
827 
828         dx = strtol(duplicate + dxpos, &end, 10);
829 
830         if ((duplicate + dxpos) == end) {
831             (*warnings)++;
832 
833             if (*warnings <= BMD_WARNING_MAX) {
834                 fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate);
835             }
836 
837             return ret;
838         }
839 
840         if (ox > dx) {
841             xdiff = ox - dx;
842         } else {
843             xdiff = dx - ox;
844         }
845 
846         if (xdiff <= max_dist) {
847             // still might be optical
848 
849             oy = strtol(original + oypos, &end, 10);
850 
851             if ((original + oypos) == end) {
852                 (*warnings)++;
853 
854                 if (*warnings <= BMD_WARNING_MAX) {
855                     fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original);
856                 }
857 
858                 return ret;
859             }
860 
861             dy = strtol(duplicate + dypos, &end, 10);
862 
863             if ((duplicate + dypos) == end) {
864                 (*warnings)++;
865 
866                 if (*warnings <= BMD_WARNING_MAX) {
867                     fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate);
868                 }
869 
870                 return ret;
871             }
872 
873             if (oy > dy) {
874                 ydiff = oy - dy;
875             } else {
876                 ydiff = dy - oy;
877             }
878 
879             if (ydiff <= max_dist) ret = 1;
880         }
881     }
882 
883     return ret;
884 }
885 
886 
887 /* Using the coordinates from the Illumina read name, see whether the duplicated read is
888    close enough (set by max_dist) to the original to be counted as optical.
889 
890    This function needs the values from the first read to be already calculated. */
891 
optical_duplicate_partial(const char * name,const int oxpos,const long ox,const long oy,bam1_t * dup,check_t * c,long max_dist,long * warnings)892 static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
893     int ret = 0;
894     char *duplicate;
895     int dxpos = 0;
896     long dx, dy;
897 
898     duplicate = bam_get_qname(dup);
899 
900     if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) {
901         return ret;
902     }
903 
904     if (strncmp(name, duplicate, oxpos - 1) == 0) {
905         // the initial parts match, look at the numbers
906         long xdiff, ydiff;
907 
908         if (ox > dx) {
909             xdiff = ox - dx;
910         } else {
911             xdiff = dx - ox;
912         }
913 
914         if (xdiff <= max_dist) {
915             // still might be optical
916 
917             if (oy > dy) {
918                 ydiff = oy - dy;
919             } else {
920                 ydiff = dy - oy;
921             }
922 
923             if (ydiff <= max_dist) ret = 1;
924         }
925     }
926 
927     c->x = dx;
928     c->y = dy;
929     c->xpos = dxpos;
930 
931     return ret;
932 }
933 
934 
935 /* Mark the read as a duplicate and update the duplicate hash (if needed) */
mark_duplicates(md_param_t * param,khash_t (duplicates)* dup_hash,bam1_t * ori,bam1_t * dup,long * optical,long * warn)936 static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
937                            long *optical, long *warn) {
938     char dup_type = 0;
939     long incoming_warnings = *warn;
940 
941     dup->core.flag |= BAM_FDUP;
942 
943     if (param->tag) {
944         if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
945             fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
946             return -1;
947         }
948     }
949 
950     if (param->opt_dist) { // mark optical duplicates
951         if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
952             bam_aux_update_str(dup, "dt", 3, "SQ");
953             dup_type = 'O';
954             (*optical)++;
955         } else {
956             // not an optical duplicate
957             bam_aux_update_str(dup, "dt", 3, "LB");
958         }
959     }
960 
961     if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) {
962         fprintf(stderr, "[markdup] warning: %ld decipher read name warnings.  New warnings will not be reported.\n",
963                         *warn);
964     }
965 
966     if (param->supp) {
967         if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) {
968             char *original = NULL;
969 
970             if (param->tag) {
971                 original = bam_get_qname(ori);
972             }
973 
974             if (add_duplicate(dup_hash, dup, original, dup_type))
975                 return -1;
976         }
977     }
978 
979     return 0;
980 }
981 
982 
983 /* If the duplicate type has changed to optical then retag and duplicate hash. */
optical_retag(md_param_t * param,khash_t (duplicates)* dup_hash,bam1_t * b,int paired,long * optical_single,long * optical_pair)984 static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
985     int ret = 0;
986 
987     if (bam_aux_update_str(b, "dt", 3, "SQ")) {
988         fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n");
989         ret = -1;
990     }
991 
992     if (paired) {
993         (*optical_pair)++;
994     } else {
995         (*optical_single)++;
996     }
997 
998     if (param->supp) {
999         // Change the duplicate type
1000 
1001         if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP)
1002             || bam_aux_get(b, "XA")) {
1003             khiter_t d;
1004 
1005             d = kh_get(duplicates, dup_hash, bam_get_qname(b));
1006 
1007             if (d == kh_end(dup_hash)) {
1008                 // error, name should already be in dup hash
1009                 fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n",
1010                     bam_get_qname(b));
1011                 ret = -1;
1012             } else {
1013                 kh_value(dup_hash, d).type = 'O';
1014             }
1015         }
1016     }
1017 
1018     return ret;
1019 }
1020 
1021 
1022 /* Check all duplicates of the highest quality read (the "original") for consistancy.  Also
1023    pre-calculate any values for use in check_duplicate_chain later.
1024    Returns 0 on success, >0 on coordinate reading error (program can continue) or
1025    <0 on an error (program should not continue. */
check_chain_against_original(md_param_t * param,khash_t (duplicates)* dup_hash,read_queue_t * ori,check_list_t * list,long * warn,long * optical_single,long * optical_pair)1026 static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
1027              check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
1028 
1029     int ret = 0;
1030     char *ori_name = bam_get_qname(ori->b);
1031     read_queue_t *current = ori->duplicate;
1032     int xpos;
1033     long x, y;
1034 
1035     if (param->opt_dist) {
1036         if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
1037             return ret;
1038         }
1039     }
1040 
1041     list->length = 0;
1042 
1043     while (current) {
1044         check_t *c;
1045 
1046         if (list->length >= list->size) {
1047             check_t *tmp;
1048 
1049             list->size *= 2;
1050 
1051             if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
1052                 fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n");
1053                 return -1;
1054             }
1055 
1056             list->c = tmp;
1057         }
1058 
1059         c = &list->c[list->length];
1060 
1061         c->b = current->b;
1062         c->x = -1;
1063         c->y = -1;
1064         c->opt = 0;
1065         c->score = 0;
1066         c->mate_score = 0;
1067         current->dup_checked = 1;
1068 
1069         if (param->tag) {
1070             uint8_t *data;
1071 
1072             // at this stage all duplicates should have a do tag
1073             if ((data = bam_aux_get(current->b, "do")) != NULL) {
1074                 // see if we need to change the tag
1075                 char *old_name = bam_aux2Z(data);
1076 
1077                 if (old_name) {
1078                     if (strcmp(old_name, ori_name) != 0) {
1079                         if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
1080                             fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n");
1081                             ret =  -1;
1082                             break;
1083                         }
1084                     }
1085                 } else {
1086                     fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
1087                     ret = -1;
1088                     break;
1089                 }
1090             }
1091         }
1092 
1093         if (param->opt_dist) {
1094             uint8_t *data;
1095             char *dup_type;
1096             int is_opt = 0;
1097             int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
1098 
1099             if ((data = bam_aux_get(current->b, "dt"))) {
1100                 if ((dup_type = bam_aux2Z(data))) {
1101                     if (strcmp(dup_type, "SQ") == 0) {
1102                         c->opt = 1;
1103                     }
1104                 }
1105             }
1106 
1107             // need to run this to get the duplicates x and y scores
1108             is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
1109 
1110             if (!c->opt && is_opt) {
1111                 if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
1112                     ret = -1;
1113                     break;
1114                 }
1115 
1116                 c->opt = 1;
1117             }
1118 
1119             c->score = calc_score(current->b);
1120 
1121             if (current_paired) {
1122                 if ((c->mate_score = get_mate_score(current->b)) == -1) {
1123                      fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1124                      ret = -1;
1125                      break;
1126                 }
1127             }
1128         }
1129 
1130         current = current->duplicate;
1131         list->length++;
1132     }
1133 
1134     return ret;
1135 }
1136 
1137 
xcoord_sort(const void * a,const void * b)1138 static int xcoord_sort(const void *a, const void *b) {
1139     check_t *ac = (check_t *) a;
1140     check_t *bc = (check_t *) b;
1141 
1142     return (ac->x - bc->x);
1143 }
1144 
1145 
1146 /* Check all the duplicates against each other to see if they are optical duplicates. */
check_duplicate_chain(md_param_t * param,khash_t (duplicates)* dup_hash,check_list_t * list,long * warn,long * optical_single,long * optical_pair)1147 static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
1148              long *warn, long *optical_single, long *optical_pair) {
1149     int ret = 0;
1150     size_t curr = 0;
1151 
1152     qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
1153 
1154     while (curr < list->length - 1) {
1155         check_t *current = &list->c[curr];
1156         size_t count = curr;
1157         char *cur_name = bam_get_qname(current->b);
1158         int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
1159 
1160         while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
1161             // while close enough along the x coordinate
1162             check_t *chk = &list->c[count];
1163 
1164             if (current->opt && chk->opt)
1165                 continue;
1166 
1167             // if both are already optical duplicates there is no need to check again, otherwise...
1168 
1169             long ydiff;
1170 
1171             if (current->y > chk->y) {
1172                 ydiff = current->y - chk->y;
1173             } else {
1174                 ydiff = chk->y - current->y;
1175             }
1176 
1177             if (ydiff > param->opt_dist)
1178                 continue;
1179 
1180             // the number are right, check the names
1181             if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0)
1182                 continue;
1183 
1184             // optical duplicates
1185             int chk_dup = 0;
1186             int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
1187 
1188             if (current_paired != chk_paired) {
1189                 if (!chk_paired) {
1190                     // chk is single vs pair, this is a dup.
1191                     chk_dup = 1;
1192                 }
1193             } else {
1194                 // do it by scores
1195                 int64_t cur_score, chk_score;
1196 
1197                 if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
1198                     if (current->b->core.flag & BAM_FQCFAIL) {
1199                         cur_score = 0;
1200                         chk_score = 1;
1201                     } else {
1202                         cur_score = 1;
1203                         chk_score = 0;
1204                     }
1205                 } else {
1206                     cur_score = current->score;
1207                     chk_score = chk->score;
1208 
1209                     if (current_paired) {
1210                         // they are pairs so add mate scores.
1211                         chk_score += chk->mate_score;
1212                         cur_score += current->mate_score;
1213                     }
1214                 }
1215 
1216                 if (cur_score == chk_score) {
1217                     if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
1218                         chk_score++;
1219                     } else {
1220                         chk_score--;
1221                     }
1222                 }
1223 
1224                 if (cur_score > chk_score) {
1225                     chk_dup = 1;
1226                 }
1227             }
1228 
1229             if (chk_dup) {
1230                 // the duplicate is the optical duplicate
1231                 if (!chk->opt) { // only change if not already an optical duplicate
1232                     if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
1233                         ret = -1;
1234                         goto fail;
1235                     }
1236 
1237                     chk->opt = 1;
1238                 }
1239             } else {
1240                 if (!current->opt) {
1241                     if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
1242                         ret = -1;
1243                         goto fail;
1244                     }
1245 
1246                     current->opt = 1;
1247                 }
1248             }
1249         }
1250 
1251         curr++;
1252     }
1253 
1254  fail:
1255     return ret;
1256 }
1257 
1258 
1259 /* Where there is more than one duplicate go down the list and check for optical duplicates and change
1260    do tags (where used) to point to original (non-duplicate) read. */
find_duplicate_chains(md_param_t * param,klist_t (read_queue)* read_buffer,khash_t (duplicates)* dup_hash,check_list_t * dup_list,const hts_pos_t prev_coord,const int32_t prev_tid,long * warn,long * optical_single,long * optical_pair,const int check_range)1261 static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
1262                                 const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
1263                                 long *optical_pair, const int check_range) {
1264     int ret = 0;
1265     kliter_t(read_queue) *rq;
1266 
1267     rq = kl_begin(read_buffer);
1268 
1269     while (rq != kl_end(read_buffer)) {
1270         read_queue_t *in_read = &kl_val(rq);
1271 
1272         if (check_range) {
1273             /* Just check against the moving window of reads based on coordinates and max read length. */
1274             if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
1275                 break;
1276             }
1277         } else {
1278             // this is the last set of results and the end entry will be blank
1279             if (!bam_get_qname(in_read->b)) {
1280                 break;
1281             }
1282         }
1283 
1284         if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
1285 
1286             // check against the original for tagging and optical duplication
1287             if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
1288                 if (ret < 0) { // real error
1289                     ret = -1;
1290                     break;
1291                 } else { // coordinate decoding error
1292                     ret = 0;
1293                     in_read->duplicate = NULL;
1294                     continue;
1295                 }
1296             }
1297 
1298             // check the rest of the duplicates against each other for optical duplication
1299             if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
1300                 ret = -1;
1301                 break;
1302             }
1303 
1304             in_read->duplicate = NULL;
1305         }
1306 
1307         rq = kl_next(rq);
1308     }
1309 
1310     return ret;
1311 }
1312 
1313 
1314 /*
1315   Function to use when estimating library size.
1316 
1317   This is based on an approximate formula for the coverage of a set
1318   obtained after sampling it a given number of times with replacement.
1319 
1320   x = number of items in the set (the number of unique fragments in the library)
1321 
1322   c = number of unique items (unique read pairs observed)
1323 
1324   n = number of items samples (total number of read pairs)
1325 
1326   c and n are known; x is unknown.
1327 
1328   As n -> infinity, the coverage (c/x) can be given as:
1329 
1330   c / x = 1 - exp(-n / x)  (see https://math.stackexchange.com/questions/32800)
1331 
1332   This needs to be solved for x, so it is rearranged to put both terms on the
1333   left side and estimate_library_size() finds a value of x which gives a
1334   result of zero (or as close as it can get).
1335  */
coverage_equation(double x,double c,double n)1336 static inline double coverage_equation(double x, double c, double n) {
1337     return c / x - 1 + exp(-n / x);
1338 }
1339 
1340 
1341 /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
estimate_library_size(unsigned long paired_reads,unsigned long paired_duplicate_reads,unsigned long optical)1342 static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
1343     unsigned long estimated_size = 0;
1344     unsigned long non_optical_pairs = (paired_reads - optical) / 2;
1345     unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
1346     unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
1347 
1348     if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
1349         double m = 1;
1350         double M = 100;
1351         int i;
1352 
1353         if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
1354             fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n");
1355             return  estimated_size;
1356         }
1357 
1358         while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
1359             M *= 10;
1360         }
1361 
1362         for (i = 0; i < 40; i++) {
1363             double r = (m + M) / 2;
1364             double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
1365 
1366             if (u > 0) {
1367                 m = r;
1368             } else if (u < 0) {
1369                 M = r;
1370             } else {
1371                 break;
1372             }
1373         }
1374 
1375         estimated_size = (unsigned long)(unique_pairs * (m + M) / 2);
1376     } else {
1377         fprintf(stderr, "[markdup] warning: unable to calculate estimated library size."
1378                         " Read pairs %ld should be greater than duplicate pairs %ld,"
1379                         " which should both be non zero.\n",
1380                         non_optical_pairs, duplicate_pairs);
1381     }
1382 
1383     return estimated_size;
1384 }
1385 
1386 
1387 /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
1388    Generally the highest quality scoring is chosen as the original and all others the duplicates.
1389    The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
1390    While single reads are compared to only one read of a pair, the pair will chosen as the original.
1391    The comparison is done on position and orientation, see above for details.
1392 
1393    Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write
1394    step.  This is because the duplicate can occur before the primary read.*/
1395 
bam_mark_duplicates(md_param_t * param)1396 static int bam_mark_duplicates(md_param_t *param) {
1397     bam_hdr_t *header = NULL;
1398     khiter_t k;
1399     khash_t(reads) *pair_hash        = kh_init(reads);
1400     khash_t(reads) *single_hash      = kh_init(reads);
1401     klist_t(read_queue) *read_buffer = kl_init(read_queue);
1402     kliter_t(read_queue) *rq;
1403     khash_t(duplicates) *dup_hash    = kh_init(duplicates);
1404     int32_t prev_tid;
1405     hts_pos_t prev_coord;
1406     read_queue_t *in_read;
1407     int ret;
1408     long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
1409     long np_duplicate, np_opt_duplicate;
1410     long opt_warnings = 0;
1411     tmp_file_t temp;
1412     char *idx_fn = NULL;
1413     int exclude = 0;
1414     check_list_t dup_list = {NULL, 0, 0};
1415 
1416     if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
1417         fprintf(stderr, "[markdup] out of memory\n");
1418         goto fail;
1419     }
1420 
1421     if ((header = sam_hdr_read(param->in)) == NULL) {
1422         fprintf(stderr, "[markdup] error reading header\n");
1423         goto fail;
1424     }
1425 
1426     // accept unknown, unsorted or coordinate sort order, but error on queryname sorted.
1427     // only really works on coordinate sorted files.
1428     kstring_t str = KS_INITIALIZE;
1429     if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) {
1430         fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
1431         ks_free(&str);
1432         goto fail;
1433     }
1434     ks_free(&str);
1435 
1436     if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
1437                         param->arg_list ? "CL" : NULL,
1438                         param->arg_list ? param->arg_list : NULL,
1439                         NULL) != 0) {
1440         fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n");
1441     }
1442 
1443     if (sam_hdr_write(param->out, header) < 0) {
1444         fprintf(stderr, "[markdup] error writing header.\n");
1445         goto fail;
1446     }
1447     if (param->write_index) {
1448         if (!(idx_fn = auto_index(param->out, param->out_fn, header)))
1449             goto fail;
1450     }
1451 
1452     // used for coordinate order checks
1453     prev_tid = prev_coord = 0;
1454 
1455     // get the buffer going
1456     in_read = kl_pushp(read_queue, read_buffer);
1457     if (!in_read) {
1458         fprintf(stderr, "[markdup] out of memory\n");
1459         goto fail;
1460     }
1461 
1462     // handling supplementary reads needs a temporary file
1463     if (param->supp) {
1464         if (tmp_file_open_write(&temp, param->prefix, 1)) {
1465             fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix);
1466             goto fail;
1467         }
1468     }
1469 
1470     if ((in_read->b = bam_init1()) == NULL) {
1471         fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
1472         goto fail;
1473     }
1474 
1475     if (param->check_chain && !(param->tag || param->opt_dist))
1476         param->check_chain = 0;
1477 
1478     if (param->check_chain) {
1479         dup_list.size = 128;
1480         dup_list.c = NULL;
1481 
1482         if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
1483             fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
1484             goto fail;
1485         }
1486     }
1487 
1488     reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
1489     np_duplicate = np_opt_duplicate = 0;
1490 
1491     while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
1492         int dup_checked = 0;
1493 
1494         // do some basic coordinate order checks
1495         if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
1496             if (in_read->b->core.tid < prev_tid ||
1497                ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
1498                 fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n");
1499                 goto fail;
1500             }
1501         }
1502 
1503         prev_coord = in_read->pos = in_read->b->core.pos;
1504         prev_tid   =  in_read->b->core.tid;
1505         in_read->pair_key.single   = 1;
1506         in_read->single_key.single = 0;
1507         in_read->duplicate = NULL;
1508         in_read->dup_checked = 0;
1509 
1510         reading++;
1511 
1512         if (param->clear && (in_read->b->core.flag & BAM_FDUP)) {
1513             uint8_t *data;
1514 
1515             in_read->b->core.flag ^= BAM_FDUP;
1516 
1517             if ((data = bam_aux_get(in_read->b, "dt")) != NULL) {
1518                 bam_aux_del(in_read->b, data);
1519             }
1520 
1521             if ((data = bam_aux_get(in_read->b, "do")) != NULL) {
1522                 bam_aux_del(in_read->b, data);
1523             }
1524         }
1525 
1526         if (param->include_fails) {
1527             exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP);
1528         } else {
1529             exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL);
1530         }
1531 
1532         // read must not be secondary, supplementary, unmapped or (possibly) failed QC
1533         if (!(in_read->b->core.flag & exclude)) {
1534             examined++;
1535 
1536 
1537             // look at the pairs first
1538             if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
1539                 int ret, mate_tmp;
1540                 key_data_t pair_key;
1541                 key_data_t single_key;
1542                 in_hash_t *bp;
1543 
1544                 if (param->mode) {
1545                     if (make_pair_key_sequence(&pair_key, in_read->b)) {
1546                         fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
1547                         goto fail;
1548                     }
1549                 } else {
1550                     if (make_pair_key_template(&pair_key, in_read->b)) {
1551                         fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
1552                         goto fail;
1553                     }
1554                 }
1555 
1556                 make_single_key(&single_key, in_read->b);
1557 
1558                 pair++;
1559                 in_read->pos = single_key.this_coord; // cigar/orientation modified pos
1560 
1561                 // put in singles hash for checking against non paired reads
1562                 k = kh_put(reads, single_hash, single_key, &ret);
1563 
1564                 if (ret > 0) { // new
1565                     // add to single duplicate hash
1566                     bp = &kh_val(single_hash, k);
1567                     bp->p = in_read;
1568                     in_read->single_key = single_key;
1569                 } else if (ret == 0) { // exists
1570                     // look at singles only for duplication marking
1571                     bp = &kh_val(single_hash, k);
1572 
1573                     if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) {
1574                        // singleton will always be marked duplicate even if
1575                        // scores more than one read of the pair
1576                         bam1_t *dup = bp->p->b;
1577 
1578                         if (param->check_chain)
1579                             in_read->duplicate = bp->p;
1580 
1581                         bp->p = in_read;
1582 
1583                         if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
1584                             goto fail;
1585 
1586                         single_dup++;
1587                     }
1588                 } else {
1589                     fprintf(stderr, "[markdup] error: single hashing failure.\n");
1590                     goto fail;
1591                 }
1592 
1593                 // now do the pair
1594                 k = kh_put(reads, pair_hash, pair_key, &ret);
1595 
1596                 if (ret > 0) { // new
1597                     // add to the pair hash
1598                     bp = &kh_val(pair_hash, k);
1599                     bp->p = in_read;
1600                     in_read->pair_key = pair_key;
1601                 } else if (ret == 0) {
1602                     int64_t old_score, new_score, tie_add = 0;
1603                     bam1_t *dup = NULL;
1604 
1605                     bp = &kh_val(pair_hash, k);
1606 
1607                     if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) {
1608                         if (bp->p->b->core.flag & BAM_FQCFAIL) {
1609                             old_score = 0;
1610                             new_score = 1;
1611                         } else {
1612                             old_score = 1;
1613                             new_score = 0;
1614                         }
1615                     } else {
1616                         if ((mate_tmp = get_mate_score(bp->p->b)) == -1) {
1617                             fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1618                             goto fail;
1619                         } else {
1620                             old_score = calc_score(bp->p->b) + mate_tmp;
1621                         }
1622 
1623                         if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
1624                             fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1625                             goto fail;
1626                         } else {
1627                             new_score = calc_score(in_read->b) + mate_tmp;
1628                         }
1629                     }
1630 
1631                     // choose the highest score as the original
1632                     // and add it to the pair hash, mark the other as duplicate
1633 
1634                     if (new_score == old_score) {
1635                         if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) {
1636                             tie_add = 1;
1637                         } else {
1638                             tie_add = -1;
1639                         }
1640                     }
1641 
1642                     if (new_score + tie_add > old_score) { // swap reads
1643                         dup = bp->p->b;
1644 
1645                         if (param->check_chain) {
1646 
1647                             if (in_read->duplicate) {
1648                                 read_queue_t *current = in_read->duplicate;
1649 
1650                                 while (current->duplicate) {
1651                                     current = current->duplicate;
1652                                 }
1653 
1654                                 current->duplicate = bp->p;
1655                             } else {
1656                                 in_read->duplicate = bp->p;
1657                             }
1658                         }
1659 
1660                         bp->p = in_read;
1661                     } else {
1662                         if (param->check_chain) {
1663                             if (bp->p->duplicate) {
1664                                 if (in_read->duplicate) {
1665                                     read_queue_t *current = bp->p->duplicate;
1666 
1667                                     while (current->duplicate) {
1668                                         current = current->duplicate;
1669                                     }
1670 
1671                                     current->duplicate = in_read->duplicate;
1672                                 }
1673 
1674                                 in_read->duplicate = bp->p->duplicate;
1675                             }
1676 
1677                             bp->p->duplicate = in_read;
1678                         }
1679 
1680                         dup = in_read->b;
1681                     }
1682 
1683                     if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
1684                         goto fail;
1685 
1686                     duplicate++;
1687                 } else {
1688                     fprintf(stderr, "[markdup] error: pair hashing failure.\n");
1689                     goto fail;
1690                 }
1691             } else { // do the single (or effectively single) reads
1692                 int ret;
1693                 key_data_t single_key;
1694                 in_hash_t *bp;
1695 
1696                 make_single_key(&single_key, in_read->b);
1697 
1698                 single++;
1699                 in_read->pos = single_key.this_coord; // cigar/orientation modified pos
1700 
1701                 k = kh_put(reads, single_hash, single_key, &ret);
1702 
1703                 if (ret > 0) { // new
1704                     bp = &kh_val(single_hash, k);
1705                     bp->p = in_read;
1706                     in_read->single_key = single_key;
1707                 } else if (ret == 0) { // exists
1708                     bp = &kh_val(single_hash, k);
1709 
1710                     if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
1711                         // if matched against one of a pair just mark as duplicate
1712 
1713                         if (param->check_chain) {
1714                             if (bp->p->duplicate) {
1715                                 in_read->duplicate = bp->p->duplicate;
1716                             }
1717 
1718                             bp->p->duplicate = in_read;
1719                         }
1720 
1721                         if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
1722                             goto fail;
1723 
1724                     } else {
1725                         int64_t old_score, new_score;
1726                         bam1_t *dup = NULL;
1727 
1728                         old_score = calc_score(bp->p->b);
1729                         new_score = calc_score(in_read->b);
1730 
1731                         // choose the highest score as the original, add it
1732                         // to the single hash and mark the other as duplicate
1733                         if (new_score > old_score) { // swap reads
1734                             dup = bp->p->b;
1735 
1736                             if (param->check_chain)
1737                                 in_read->duplicate = bp->p;
1738 
1739                             bp->p = in_read;
1740                         } else {
1741                             if (param->check_chain) {
1742                                 if (bp->p->duplicate) {
1743                                     in_read->duplicate = bp->p->duplicate;
1744                                 }
1745 
1746                                 bp->p->duplicate = in_read;
1747                             }
1748 
1749                             dup = in_read->b;
1750                         }
1751 
1752                         if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
1753                             goto fail;
1754                     }
1755 
1756                     single_dup++;
1757                 } else {
1758                     fprintf(stderr, "[markdup] error: single hashing failure.\n");
1759                     goto fail;
1760                 }
1761             }
1762         } else {
1763             excluded++;
1764         }
1765 
1766         // loop through the stored reads and write out those we
1767         // no longer need
1768         rq = kl_begin(read_buffer);
1769         while (rq != kl_end(read_buffer)) {
1770             in_read = &kl_val(rq);
1771 
1772             /* keep a moving window of reads based on coordinates and max read length.  Any unaligned reads
1773                should just be written as they cannot be matched as duplicates. */
1774             if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
1775                 break;
1776             }
1777 
1778             if (!dup_checked && param->check_chain) {
1779                 // check for multiple optical duplicates of the same original read
1780 
1781                 if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
1782                     fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
1783                     goto fail;
1784                 }
1785 
1786                 dup_checked = 1;
1787             }
1788 
1789 
1790             if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
1791                 break;
1792             }
1793 
1794             if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
1795                 if (param->supp) {
1796                     if (tmp_file_write(&temp, in_read->b)) {
1797                         fprintf(stderr, "[markdup] error: writing temp output failed.\n");
1798                         goto fail;
1799                     }
1800                 } else {
1801                     if (sam_write1(param->out, header, in_read->b) < 0) {
1802                         fprintf(stderr, "[markdup] error: writing output failed.\n");
1803                         goto fail;
1804                     }
1805                 }
1806 
1807                 writing++;
1808             }
1809 
1810             // remove from hash
1811             if (in_read->pair_key.single == 0) {
1812                 k = kh_get(reads, pair_hash, in_read->pair_key);
1813                 kh_del(reads, pair_hash, k);
1814             }
1815 
1816             if (in_read->single_key.single == 1) {
1817                 k = kh_get(reads, single_hash, in_read->single_key);
1818                 kh_del(reads, single_hash, k);
1819             }
1820 
1821             kl_shift(read_queue, read_buffer, NULL);
1822             bam_destroy1(in_read->b);
1823             rq = kl_begin(read_buffer);
1824         }
1825 
1826         // set the next one up for reading
1827         in_read = kl_pushp(read_queue, read_buffer);
1828         if (!in_read) {
1829             fprintf(stderr, "[markdup] out of memory\n");
1830             goto fail;
1831         }
1832 
1833         if ((in_read->b = bam_init1()) == NULL) {
1834             fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
1835             goto fail;
1836         }
1837     }
1838 
1839     if (ret < -1) {
1840         fprintf(stderr, "[markdup] error: truncated input file.\n");
1841         goto fail;
1842     }
1843 
1844     // one last check
1845     if (param->tag || param->opt_dist) {
1846         if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
1847             fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
1848             goto fail;
1849         }
1850     }
1851 
1852     // write out the end of the list
1853     rq = kl_begin(read_buffer);
1854     while (rq != kl_end(read_buffer)) {
1855         in_read = &kl_val(rq);
1856 
1857         if (bam_get_qname(in_read->b)) { // last entry will be blank
1858             if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
1859                 if (param->supp) {
1860                     if (tmp_file_write(&temp, in_read->b)) {
1861                         fprintf(stderr, "[markdup] error: writing temp output failed.\n");
1862                         goto fail;
1863                     }
1864                 } else {
1865                     if (sam_write1(param->out, header, in_read->b) < 0) {
1866                         fprintf(stderr, "[markdup] error: writing output failed.\n");
1867                         goto fail;
1868                     }
1869                 }
1870 
1871                 writing++;
1872             }
1873         }
1874 
1875         kl_shift(read_queue, read_buffer, NULL);
1876         bam_destroy1(in_read->b);
1877         rq = kl_begin(read_buffer);
1878     }
1879 
1880     if (param->supp) {
1881         bam1_t *b;
1882 
1883         if (tmp_file_end_write(&temp)) {
1884             fprintf(stderr, "[markdup] error: unable to end tmp writing.\n");
1885             goto fail;
1886         }
1887 
1888         // read data from temp file and mark duplicate supplementary alignments
1889 
1890         if (tmp_file_begin_read(&temp)) {
1891             goto fail;
1892         }
1893 
1894         b = bam_init1();
1895 
1896         while ((ret = tmp_file_read(&temp, b)) > 0) {
1897 
1898             if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) {
1899 
1900                 k = kh_get(duplicates, dup_hash, bam_get_qname(b));
1901 
1902                 if (k != kh_end(dup_hash)) {
1903 
1904                     b->core.flag  |= BAM_FDUP;
1905                     np_duplicate++;
1906 
1907                     if (param->tag && kh_val(dup_hash, k).name) {
1908                         if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
1909                             fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
1910                             goto fail;
1911                         }
1912                     }
1913 
1914                     if (param->opt_dist) {
1915                         if (kh_val(dup_hash, k).type) {
1916                             bam_aux_update_str(b, "dt", 3, "SQ");
1917                             np_opt_duplicate++;
1918                         } else {
1919                             bam_aux_update_str(b, "dt", 3, "LB");
1920                         }
1921                     }
1922                 }
1923             }
1924 
1925             if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
1926                 if (sam_write1(param->out, header, b) < 0) {
1927                     fprintf(stderr, "[markdup] error: writing final output failed.\n");
1928                     goto fail;
1929                 }
1930             }
1931         }
1932 
1933         if (ret == -1) {
1934             fprintf(stderr, "[markdup] error: failed to read tmp file.\n");
1935             goto fail;
1936         }
1937 
1938         for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) {
1939             if (kh_exist(dup_hash, k)) {
1940                 free(kh_val(dup_hash, k).name);
1941                 free((char *)kh_key(dup_hash, k));
1942                 kh_key(dup_hash, k) = NULL;
1943             }
1944         }
1945 
1946         tmp_file_destroy(&temp);
1947         bam_destroy1(b);
1948     }
1949 
1950     if (opt_warnings) {
1951         fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n",
1952                         opt_warnings);
1953     }
1954 
1955     if (param->do_stats) {
1956         FILE *fp;
1957         int file_open = 0;
1958         unsigned long els;
1959 
1960         if (param->stats_file) {
1961             if (NULL == (fp = fopen(param->stats_file, "w"))) {
1962                 fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file);
1963                 fp = stderr;
1964             } else {
1965                 file_open = 1;
1966             }
1967         } else {
1968             fp = stderr;
1969         }
1970 
1971         els = estimate_library_size(pair, duplicate, optical);
1972 
1973         fprintf(fp,
1974                 "COMMAND: %s\n"
1975                 "READ: %ld\n"
1976                 "WRITTEN: %ld\n"
1977                 "EXCLUDED: %ld\n"
1978                 "EXAMINED: %ld\n"
1979                 "PAIRED: %ld\n"
1980                 "SINGLE: %ld\n"
1981                 "DUPLICATE PAIR: %ld\n"
1982                 "DUPLICATE SINGLE: %ld\n"
1983                 "DUPLICATE PAIR OPTICAL: %ld\n"
1984                 "DUPLICATE SINGLE OPTICAL: %ld\n"
1985                 "DUPLICATE NON PRIMARY: %ld\n"
1986                 "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
1987                 "DUPLICATE PRIMARY TOTAL: %ld\n"
1988                 "DUPLICATE TOTAL: %ld\n"
1989                 "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single,
1990                                 duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate,
1991                                 single_dup + duplicate, single_dup + duplicate + np_duplicate, els);
1992 
1993         if (file_open) {
1994             fclose(fp);
1995         }
1996     }
1997 
1998     if (param->write_index) {
1999         if (sam_idx_save(param->out) < 0) {
2000             print_error_errno("markdup", "writing index failed");
2001             goto fail;
2002         }
2003     }
2004 
2005     if (param->check_chain && (param->tag || param->opt_dist))
2006         free(dup_list.c);
2007 
2008     kh_destroy(reads, pair_hash);
2009     kh_destroy(reads, single_hash);
2010     kl_destroy(read_queue, read_buffer);
2011     kh_destroy(duplicates, dup_hash);
2012     sam_hdr_destroy(header);
2013 
2014     return 0;
2015 
2016  fail:
2017     for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq))
2018         bam_destroy1(kl_val(rq).b);
2019     kl_destroy(read_queue, read_buffer);
2020 
2021     for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) {
2022         if (kh_exist(dup_hash, k)) {
2023             free((char *)kh_key(dup_hash, k));
2024         }
2025     }
2026     kh_destroy(duplicates, dup_hash);
2027 
2028     if (param->check_chain && (param->tag || param->opt_dist))
2029         free(dup_list.c);
2030 
2031     kh_destroy(reads, pair_hash);
2032     kh_destroy(reads, single_hash);
2033     sam_hdr_destroy(header);
2034     return 1;
2035 }
2036 
2037 
markdup_usage(void)2038 static int markdup_usage(void) {
2039     fprintf(stderr, "\n");
2040     fprintf(stderr, "Usage:  samtools markdup <input.bam> <output.bam>\n\n");
2041     fprintf(stderr, "Option: \n");
2042     fprintf(stderr, "  -r               Remove duplicate reads\n");
2043     fprintf(stderr, "  -l INT           Max read length (default 300 bases)\n");
2044     fprintf(stderr, "  -S               Mark supplementary alignments of duplicates as duplicates (slower).\n");
2045     fprintf(stderr, "  -s               Report stats.\n");
2046     fprintf(stderr, "  -f NAME          Write stats to named file.  Implies -s.\n");
2047     fprintf(stderr, "  -T PREFIX        Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n");
2048     fprintf(stderr, "  -d INT           Optical distance (if set, marks with dt tag)\n");
2049     fprintf(stderr, "  -c               Clear previous duplicate settings and tags.\n");
2050     fprintf(stderr, "  -m --mode TYPE   Duplicate decision method for paired reads.\n"
2051                     "                   TYPE = t measure positions based on template start/end (default).\n"
2052                     "                          s measure positions based on sequence start.\n");
2053     fprintf(stderr, "  -n               Reduce optical duplicate accuracy (faster results with many duplicates).\n");
2054     fprintf(stderr, "  -u               Output uncompressed data\n");
2055     fprintf(stderr, "  --include-fails  Include quality check failed reads.\n");
2056     fprintf(stderr, "  --no-PG          Do not add a PG line\n");
2057     fprintf(stderr, "  --no-multi-dup   Reduced duplicates of duplicates checking.\n");
2058     fprintf(stderr, "  -t               Mark primary duplicates with the name of the original in a \'do\' tag."
2059                                   " Mainly for information and debugging.\n");
2060 
2061     sam_global_opt_help(stderr, "-.O..@..");
2062 
2063     fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone"
2064                      " through fixmates with the mate scoring option on.\n");
2065 
2066     return 1;
2067 }
2068 
2069 
bam_markdup(int argc,char ** argv)2070 int bam_markdup(int argc, char **argv) {
2071     int c, ret;
2072     char wmode[4] = {'w', 'b', 0, 0};
2073     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
2074     htsThreadPool p = {NULL, 0};
2075     kstring_t tmpprefix = {0, 0, NULL};
2076     struct stat st;
2077     unsigned int t;
2078     md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
2079 
2080     static const struct option lopts[] = {
2081         SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
2082         {"include-fails", no_argument, NULL, 1001},
2083         {"no-PG", no_argument, NULL, 1002},
2084         {"mode", required_argument, NULL, 'm'},
2085         {"no-multi-dup", no_argument, NULL, 1003},
2086         {NULL, 0, NULL, 0}
2087     };
2088 
2089     while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
2090         switch (c) {
2091             case 'r': param.remove_dups = 1; break;
2092             case 'l': param.max_length = atoi(optarg); break;
2093             case 's': param.do_stats = 1; break;
2094             case 'T': kputs(optarg, &tmpprefix); break;
2095             case 'S': param.supp = 1; break;
2096             case 't': param.tag = 1; break;
2097             case 'f': param.stats_file = optarg; param.do_stats = 1; break;
2098             case 'd': param.opt_dist = atoi(optarg); break;
2099             case 'c': param.clear = 1; break;
2100             case 'm':
2101                 if (strcmp(optarg, "t") == 0) {
2102                     param.mode = MD_MODE_TEMPLATE;
2103                 } else if (strcmp(optarg, "s") == 0) {
2104                     param.mode = MD_MODE_SEQUENCE;
2105                 } else {
2106                     fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg);
2107                     return markdup_usage();
2108                 }
2109 
2110                 break;
2111             case 'u': wmode[2] = '0'; break;
2112             case 1001: param.include_fails = 1; break;
2113             case 1002: param.no_pg = 1; break;
2114             case 1003: param.check_chain = 0; break;
2115             default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
2116             /* else fall-through */
2117             case '?': return markdup_usage();
2118         }
2119     }
2120 
2121     if (optind + 2 > argc)
2122         return markdup_usage();
2123 
2124     if (param.opt_dist < 0) param.opt_dist = 0;
2125     if (param.max_length < 0) param.max_length = 300;
2126 
2127     param.in = sam_open_format(argv[optind], "r", &ga.in);
2128 
2129     if (!param.in) {
2130         print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
2131         return 1;
2132     }
2133 
2134     sam_open_mode(wmode + 1, argv[optind + 1], NULL);
2135     param.out = sam_open_format(argv[optind + 1], wmode, &ga.out);
2136 
2137     if (!param.out) {
2138         print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
2139         return 1;
2140     }
2141 
2142     if (ga.nthreads > 0)  {
2143         if (!(p.pool = hts_tpool_init(ga.nthreads))) {
2144             fprintf(stderr, "[markdup] error creating thread pool\n");
2145             return 1;
2146         }
2147 
2148         hts_set_opt(param.in,  HTS_OPT_THREAD_POOL, &p);
2149         hts_set_opt(param.out, HTS_OPT_THREAD_POOL, &p);
2150     }
2151 
2152     // actual stuff happens here
2153 
2154     // we need temp files so fix up the name here
2155     if (tmpprefix.l == 0) {
2156 
2157         if (strcmp(argv[optind + 1], "-") != 0)
2158             ksprintf(&tmpprefix, "%s.", argv[optind + 1]);
2159         else
2160             kputc('.', &tmpprefix);
2161     }
2162 
2163     if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
2164         if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix);
2165     }
2166 
2167     t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2168     ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
2169     param.prefix = tmpprefix.s;
2170 
2171     param.arg_list = stringify_argv(argc + 1, argv - 1);
2172     param.write_index = ga.write_index;
2173     param.out_fn = argv[optind + 1];
2174 
2175     ret = bam_mark_duplicates(&param);
2176 
2177     sam_close(param.in);
2178 
2179     if (sam_close(param.out) < 0) {
2180         fprintf(stderr, "[markdup] error closing output file\n");
2181         ret = 1;
2182     }
2183 
2184     if (p.pool) hts_tpool_destroy(p.pool);
2185 
2186     free(param.arg_list);
2187     free(tmpprefix.s);
2188     sam_global_args_free(&ga);
2189 
2190     return ret;
2191 }
2192