1 /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
2 through fixmates with the mate scoring option on.
3
4 Copyright (C) 2017-2021 Genome Research Ltd.
5
6 Author: Andrew Whitwham <aw7@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE
25
26 Estimate library size derived from Picard DuplicationMetrics.java
27 Copyright (c) 2009,2018 The Broad Institute. MIT license.
28 */
29
30 #include <config.h>
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <ctype.h>
37 #include <time.h>
38 #include <sys/stat.h>
39 #include <math.h>
40 #include "htslib/thread_pool.h"
41 #include "htslib/sam.h"
42 #include "sam_opts.h"
43 #include "samtools.h"
44 #include "htslib/khash.h"
45 #include "htslib/klist.h"
46 #include "htslib/kstring.h"
47 #include "tmp_file.h"
48
49
50 typedef struct {
51 samFile *in;
52 samFile *out;
53 char *prefix;
54 int remove_dups;
55 int32_t max_length;
56 int do_stats;
57 int supp;
58 int tag;
59 int opt_dist;
60 int no_pg;
61 int clear;
62 int mode;
63 int write_index;
64 int include_fails;
65 int check_chain;
66 char *stats_file;
67 char *arg_list;
68 char *out_fn;
69 } md_param_t;
70
71 typedef struct {
72 hts_pos_t this_coord;
73 hts_pos_t other_coord;
74 int32_t this_ref;
75 int32_t other_ref;
76 int8_t single;
77 int8_t leftmost;
78 int8_t orientation;
79 } key_data_t;
80
81 typedef struct read_queue_s {
82 key_data_t pair_key;
83 key_data_t single_key;
84 bam1_t *b;
85 struct read_queue_s *duplicate;
86 hts_pos_t pos;
87 int dup_checked;
88 } read_queue_t;
89
90 typedef struct {
91 read_queue_t *p;
92 } in_hash_t;
93
94 typedef struct {
95 char *name;
96 char type;
97 } dup_map_t;
98
99 typedef struct {
100 bam1_t *b;
101 int64_t score;
102 int64_t mate_score;
103 long x;
104 long y;
105 int opt;
106 int xpos;
107 } check_t;
108
109
110 typedef struct {
111 check_t *c;
112 size_t size;
113 size_t length;
114 } check_list_t;
115
116 static khint32_t do_hash(unsigned char *key, khint32_t len);
117
hash_key(key_data_t key)118 static khint_t hash_key(key_data_t key) {
119 int i = 0;
120 khint_t hash;
121
122 if (key.single) {
123 unsigned char sig[13];
124
125 memcpy(sig + i, &key.this_ref, 4); i += 4;
126 memcpy(sig + i, &key.this_coord, 8); i += 8;
127 memcpy(sig + i, &key.orientation, 1); i += 1;
128
129 hash = do_hash(sig, i);
130 } else {
131 unsigned char sig[26];
132
133 memcpy(sig + i, &key.this_ref, 4); i += 4;
134 memcpy(sig + i, &key.this_coord, 8); i += 8;
135 memcpy(sig + i, &key.other_ref, 4); i += 4;
136 memcpy(sig + i, &key.other_coord, 8); i += 8;
137 memcpy(sig + i, &key.leftmost, 1); i += 1;
138 memcpy(sig + i, &key.orientation, 1); i += 1;
139
140 hash = do_hash(sig, i);
141 }
142
143 return hash;
144 }
145
146
key_equal(key_data_t a,key_data_t b)147 static int key_equal(key_data_t a, key_data_t b) {
148 int match = 1;
149
150 if (a.this_coord != b.this_coord)
151 match = 0;
152 else if (a.orientation != b.orientation)
153 match = 0;
154 else if (a.this_ref != b.this_ref)
155 match = 0;
156 else if (a.single != b.single)
157 match = 0;
158
159 if (!a.single) {
160 if (a.other_coord != b.other_coord)
161 match = 0;
162 else if (a.leftmost != b.leftmost)
163 match = 0;
164 else if (a.other_ref != b.other_ref)
165 match = 0;
166 }
167
168 return match;
169 }
170
171
172 #define __free_queue_element(p)
173
174 // Orientations (prime numbers to feed to hashing algorithm)
175 #define O_FF 2
176 #define O_RR 3
177 #define O_FR 5
178 #define O_RF 7
179
180 // Left or rightmost
181 #define R_LE 11
182 #define R_RI 13
183
184 #define BMD_WARNING_MAX 10
185
186 #define MD_MIN_QUALITY 15
187
188 // Duplicate finding mode
189 #define MD_MODE_TEMPLATE 0
190 #define MD_MODE_SEQUENCE 1
191
192 KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
KLIST_INIT(read_queue,read_queue_t,__free_queue_element)193 KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
194 KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
195
196
197 /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
198
199 static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
200 char *c = cigar;
201 int64_t clipped = 0;
202
203 while (*c && *c != '*') {
204 long num = 0;
205
206 if (isdigit((int)*c)) {
207 num = strtol(c, &c, 10);
208 } else {
209 num = 1;
210 }
211
212 if (*c == 'S' || *c == 'H') { // clips
213 clipped += num;
214 } else {
215 break;
216 }
217
218 c++;
219 }
220
221 return op - clipped + 1;
222 }
223
224
225 /* Calculate the current read's start based on the stored cigar string. */
226
unclipped_start(bam1_t * b)227 static hts_pos_t unclipped_start(bam1_t *b) {
228 uint32_t *cigar = bam_get_cigar(b);
229 int64_t clipped = 0;
230 uint32_t i;
231
232 for (i = 0; i < b->core.n_cigar; i++) {
233 char c = bam_cigar_opchr(cigar[i]);
234
235 if (c == 'S' || c == 'H') { // clips
236 clipped += bam_cigar_oplen(cigar[i]);
237 } else {
238 break;
239 }
240 }
241
242 return b->core.pos - clipped + 1;
243 }
244
245
246 /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
247
unclipped_other_end(int64_t op,char * cigar)248 static hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
249 char *c = cigar;
250 int64_t refpos = 0;
251 int skip = 1;
252
253 while (*c && *c != '*') {
254 long num = 0;
255
256 if (isdigit((int)*c)) {
257 num = strtol(c, &c, 10);
258 } else {
259 num = 1;
260 }
261
262 switch (*c) {
263 case 'M':
264 case 'D':
265 case 'N':
266 case '=':
267 case 'X':
268 refpos += num;
269 skip = 0; // ignore initial clips
270 break;
271
272 case 'S':
273 case 'H':
274 if (!skip) {
275 refpos += num;
276 }
277 break;
278 }
279
280 c++;
281 }
282
283 return op + refpos;
284 }
285
286
287 /* Calculate the current read's end based on the stored cigar string. */
288
unclipped_end(bam1_t * b)289 static hts_pos_t unclipped_end(bam1_t *b) {
290 uint32_t *cigar = bam_get_cigar(b);
291 hts_pos_t end_pos, clipped = 0;
292 int32_t i;
293
294 end_pos = bam_endpos(b);
295
296 // now get the clipped end bases (if any)
297 // if we get to the beginning of the cigar string
298 // without hitting a non-clip then the results are meaningless
299 for (i = b->core.n_cigar - 1; i >= 0; i--) {
300 char c = bam_cigar_opchr(cigar[i]);
301
302 if (c == 'S' || c == 'H') { // clips
303 clipped += bam_cigar_oplen(cigar[i]);
304 } else {
305 break;
306 }
307 }
308
309 return end_pos + clipped;
310 }
311
312
313 /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
314
do_hash(unsigned char * key,khint32_t len)315 static khint32_t do_hash(unsigned char *key, khint32_t len) {
316 khint32_t hash, i;
317
318 for (hash = 0, i = 0; i < len; ++i) {
319 hash += key[i];
320 hash += (hash << 10);
321 hash ^= (hash >> 6);
322 }
323
324 hash += (hash << 3);
325 hash ^= (hash >> 11);
326 hash += (hash << 15);
327
328 return hash;
329 }
330
331
332 /* Get mate score from tag. */
333
get_mate_score(bam1_t * b)334 static int64_t get_mate_score(bam1_t *b) {
335 uint8_t *data;
336 int64_t score;
337
338 if ((data = bam_aux_get(b, "ms"))) {
339 score = bam_aux2i(data);
340 } else {
341 fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
342 return -1;
343 }
344
345 return score;
346 }
347
348
349 /* Calc current score from quality. */
350
calc_score(bam1_t * b)351 static int64_t calc_score(bam1_t *b)
352 {
353 int64_t score = 0;
354 uint8_t *qual = bam_get_qual(b);
355 int i;
356
357 for (i = 0; i < b->core.l_qseq; i++) {
358 if (qual[i] >= MD_MIN_QUALITY) score += qual[i];
359 }
360
361 return score;
362 }
363
364
365 /* Create a signature hash of the current read and its pair.
366 Uses the unclipped start (or end depending on orientation),
367 the reference id, orientation and whether the current
368 read is leftmost of the pair. */
369
make_pair_key_template(key_data_t * key,bam1_t * bam)370 static int make_pair_key_template(key_data_t *key, bam1_t *bam) {
371 hts_pos_t this_coord, other_coord, this_end, other_end;
372 int32_t this_ref, other_ref;
373 int8_t orientation, leftmost;
374 uint8_t *data;
375 char *cig;
376
377 this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
378 other_ref = bam->core.mtid + 1;
379
380 this_coord = unclipped_start(bam);
381 this_end = unclipped_end(bam);
382
383 if ((data = bam_aux_get(bam, "MC"))) {
384 if (!(cig = bam_aux2Z(data))) {
385 fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
386 return 1;
387 }
388
389 other_end = unclipped_other_end(bam->core.mpos, cig);
390 other_coord = unclipped_other_start(bam->core.mpos, cig);
391 } else {
392 fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
393 return 1;
394 }
395
396 // work out orientations
397 if (this_ref != other_ref) {
398 leftmost = this_ref < other_ref;
399 } else {
400 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
401 if (!bam_is_rev(bam)) {
402 leftmost = this_coord <= other_coord;
403 } else {
404 leftmost = this_end <= other_end;
405 }
406 } else {
407 if (bam_is_rev(bam)) {
408 leftmost = this_end <= other_coord;
409 } else {
410 leftmost = this_coord <= other_end;
411 }
412 }
413 }
414
415 // pair orientation
416 if (leftmost) {
417 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
418 other_coord = other_end;
419
420 if (!bam_is_rev(bam)) {
421 if (bam->core.flag & BAM_FREAD1) {
422 orientation = O_FF;
423 } else {
424 orientation = O_RR;
425 }
426 } else {
427 if (bam->core.flag & BAM_FREAD1) {
428 orientation = O_RR;
429 } else {
430 orientation = O_FF;
431 }
432 }
433 } else {
434 if (!bam_is_rev(bam)) {
435 orientation = O_FR;
436 other_coord = other_end;
437 } else {
438 orientation = O_RF;
439 this_coord = this_end;
440 }
441 }
442 } else {
443 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
444 this_coord = this_end;
445
446 if (!bam_is_rev(bam)) {
447 if (bam->core.flag & BAM_FREAD1) {
448 orientation = O_RR;
449 } else {
450 orientation = O_FF;
451 }
452 } else {
453 if (bam->core.flag & BAM_FREAD1) {
454 orientation = O_FF;
455 } else {
456 orientation = O_RR;
457 }
458 }
459 } else {
460 if (!bam_is_rev(bam)) {
461 orientation = O_RF;
462 other_coord = other_end;
463 } else {
464 orientation = O_FR;
465 this_coord = this_end;
466 }
467 }
468 }
469
470 if (!leftmost)
471 leftmost = R_RI;
472 else
473 leftmost = R_LE;
474
475 key->single = 0;
476 key->this_ref = this_ref;
477 key->this_coord = this_coord;
478 key->other_ref = other_ref;
479 key->other_coord = other_coord;
480 key->leftmost = leftmost;
481 key->orientation = orientation;
482
483 return 0;
484 }
485
486
make_pair_key_sequence(key_data_t * key,bam1_t * bam)487 static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) {
488 hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
489 int32_t this_ref, other_ref;
490 int8_t orientation, left_read;
491 uint8_t *data;
492 char *cig;
493
494 this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
495 other_ref = bam->core.mtid + 1;
496
497 this_coord = unclipped_start(bam);
498 this_end = unclipped_end(bam);
499
500 if ((data = bam_aux_get(bam, "MC"))) {
501 if (!(cig = bam_aux2Z(data))) {
502 fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
503 return 1;
504 }
505
506 other_end = unclipped_other_end(bam->core.mpos, cig);
507 other_coord = unclipped_other_start(bam->core.mpos, cig);
508 } else {
509 fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
510 return 1;
511 }
512
513 // work out orientations
514 if (this_ref != other_ref) {
515 leftmost = this_ref - other_ref;
516 } else {
517 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
518 if (!bam_is_rev(bam)) {
519 leftmost = this_coord - other_coord;
520 } else {
521 leftmost = this_end - other_end;
522 }
523 } else {
524 if (bam_is_rev(bam)) {
525 leftmost = this_end - other_coord;
526 } else {
527 leftmost = this_coord - other_end;
528 }
529 }
530 }
531
532 if (leftmost < 0) {
533 leftmost = 1;
534 } else if (leftmost > 0) {
535 leftmost = 0;
536 } else {
537 // tie breaks
538
539 if (bam->core.pos == bam->core.mpos) {
540 if (bam->core.flag & BAM_FREAD1) {
541 leftmost = 1;
542 } else {
543 leftmost = 0;
544 }
545 } else if (bam->core.pos < bam->core.mpos) {
546 leftmost = 1;
547 } else {
548 leftmost = 0;
549 }
550 }
551
552 // pair orientation
553 if (leftmost) {
554 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
555
556 if (!bam_is_rev(bam)) {
557 orientation = O_FF;
558 } else {
559 orientation = O_RR;
560 }
561 } else {
562 if (!bam_is_rev(bam)) {
563 orientation = O_FR;
564 } else {
565 orientation = O_RF;
566 }
567 }
568 } else {
569 if (bam_is_rev(bam) == bam_is_mrev(bam)) {
570
571 if (!bam_is_rev(bam)) {
572 orientation = O_RR;
573 } else {
574 orientation = O_FF;
575 }
576 } else {
577 if (!bam_is_rev(bam)) {
578 orientation = O_RF;
579 } else {
580 orientation = O_FR;
581 }
582 }
583 }
584
585 if (!leftmost)
586 left_read = R_RI;
587 else
588 left_read = R_LE;
589
590 if (!bam_is_rev(bam)) {
591 this_coord = unclipped_start(bam);
592 } else {
593 this_coord = unclipped_end(bam);
594 }
595
596 if (!bam_is_mrev(bam)) {
597 other_coord = unclipped_other_start(bam->core.mpos, cig);
598 } else {
599 other_coord = unclipped_other_end(bam->core.mpos, cig);
600 }
601
602 key->single = 0;
603 key->this_ref = this_ref;
604 key->this_coord = this_coord;
605 key->other_ref = other_ref;
606 key->other_coord = other_coord;
607 key->leftmost = left_read;
608 key->orientation = orientation;
609
610 return 0;
611 }
612
613 /* Create a signature hash of single read (or read with an unmatched pair).
614 Uses unclipped start (or end depending on orientation), reference id,
615 and orientation. */
616
make_single_key(key_data_t * key,bam1_t * bam)617 static void make_single_key(key_data_t *key, bam1_t *bam) {
618 hts_pos_t this_coord;
619 int32_t this_ref;
620 int8_t orientation;
621
622 this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
623
624 if (bam_is_rev(bam)) {
625 this_coord = unclipped_end(bam);
626 orientation = O_RR;
627 } else {
628 this_coord = unclipped_start(bam);
629 orientation = O_FF;
630 }
631
632 key->single = 1;
633 key->this_ref = this_ref;
634 key->this_coord = this_coord;
635 key->orientation = orientation;
636 }
637
638
639 /* Add the duplicate name to a hash if it does not exist. */
640
add_duplicate(khash_t (duplicates)* d_hash,bam1_t * dupe,char * orig_name,char type)641 static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) {
642 khiter_t d;
643 int ret;
644
645 d = kh_get(duplicates, d_hash, bam_get_qname(dupe));
646
647 if (d == kh_end(d_hash)) {
648 char *name = strdup(bam_get_qname(dupe));
649 if (name) {
650 d = kh_put(duplicates, d_hash, name, &ret);
651 } else {
652 ret = -1;
653 }
654
655 if (ret >= 0) {
656 if (orig_name) {
657 if (ret == 0) {
658 // replace old name
659 free(kh_value(d_hash, d).name);
660 free(name);
661 }
662
663 kh_value(d_hash, d).name = strdup(orig_name);
664
665 if (kh_value(d_hash, d).name == NULL) {
666 fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n");
667 return 1;
668 }
669 } else {
670 kh_value(d_hash, d).name = NULL;
671 }
672
673 kh_value(d_hash, d).type = type;
674 } else {
675 fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n");
676 free(name);
677 return 1;
678 }
679 }
680
681 return 0;
682 }
683
684
685 /* Get the position of the coordinates from the read name. */
get_coordinate_positions(const char * qname,int * xpos,int * ypos)686 static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
687 int sep = 0;
688 int pos = 0;
689
690 while (qname[pos]) {
691 if (qname[pos] == ':') {
692 sep++;
693
694 if (sep == 2) {
695 *xpos = pos + 1;
696 } else if (sep == 3) {
697 *ypos = pos + 1;
698 } else if (sep == 4) { // HiSeq style names
699 *xpos = *ypos;
700 *ypos = pos + 1;
701 } else if (sep == 5) { // Newer Illumina format
702 *xpos = pos + 1;
703 } else if (sep == 6) {
704 *ypos = pos + 1;
705 }
706 }
707
708 pos++;
709 }
710
711 return sep;
712 }
713
714
get_coordinates(const char * name,int * xpos_out,long * x_coord,long * y_coord,long * warnings)715 static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
716 int ret = 1;
717 int seps, xpos = 0, ypos = 0;
718 long x = 0, y = 0;
719 char *end;
720
721 seps = get_coordinate_positions(name, &xpos, &ypos);
722
723 /* The most current Illumina read format at time of writing is:
724 @machine:run:flowcell:lane:tile:x:y:UMI or
725 @machine:run:flowcell:lane:tile:x:y
726
727 Counting the separating colons gives us a quick format check.
728 Older name formats have fewer elements.
729 */
730
731 if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
732 (*warnings)++;
733
734 if (*warnings <= BMD_WARNING_MAX) {
735 fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
736 }
737
738 return ret;
739 }
740
741 x = strtol(name + xpos, &end, 10);
742
743 if ((name + xpos) == end) {
744 (*warnings)++;
745
746 if (*warnings <= BMD_WARNING_MAX) {
747 fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
748 }
749
750 return ret;
751 }
752
753 y = strtol(name + ypos, &end, 10);
754
755 if ((name + ypos) == end) {
756 (*warnings)++;
757
758 if (*warnings <= BMD_WARNING_MAX) {
759 fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
760 }
761
762 return ret;
763 }
764
765 *x_coord = x;
766 *y_coord = y;
767 *xpos_out = xpos;
768 ret = 0;
769
770 return ret;
771 }
772
773
774 /* Using the coordinates from the Illumina read name, see whether the duplicated read is
775 close enough (set by max_dist) to the original to be counted as optical.*/
776
optical_duplicate(bam1_t * ori,bam1_t * dup,long max_dist,long * warnings)777 static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
778 int ret = 0, seps;
779 char *original, *duplicate;
780 int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0;
781
782
783 original = bam_get_qname(ori);
784 duplicate = bam_get_qname(dup);
785
786 seps = get_coordinate_positions(original, &oxpos, &oypos);
787
788 if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
789 (*warnings)++;
790
791 if (*warnings <= BMD_WARNING_MAX) {
792 fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original);
793 }
794
795 return ret;
796 }
797
798 seps = get_coordinate_positions(duplicate, &dxpos, &dypos);
799
800 if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
801
802 (*warnings)++;
803
804 if (*warnings <= BMD_WARNING_MAX) {
805 fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate);
806 }
807
808 return ret;
809 }
810
811 if (strncmp(original, duplicate, oxpos - 1) == 0) {
812 // the initial parts match, look at the numbers
813 long ox, oy, dx, dy, xdiff, ydiff;
814 char *end;
815
816 ox = strtol(original + oxpos, &end, 10);
817
818 if ((original + oxpos) == end) {
819 (*warnings)++;
820
821 if (*warnings <= BMD_WARNING_MAX) {
822 fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original);
823 }
824
825 return ret;
826 }
827
828 dx = strtol(duplicate + dxpos, &end, 10);
829
830 if ((duplicate + dxpos) == end) {
831 (*warnings)++;
832
833 if (*warnings <= BMD_WARNING_MAX) {
834 fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate);
835 }
836
837 return ret;
838 }
839
840 if (ox > dx) {
841 xdiff = ox - dx;
842 } else {
843 xdiff = dx - ox;
844 }
845
846 if (xdiff <= max_dist) {
847 // still might be optical
848
849 oy = strtol(original + oypos, &end, 10);
850
851 if ((original + oypos) == end) {
852 (*warnings)++;
853
854 if (*warnings <= BMD_WARNING_MAX) {
855 fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original);
856 }
857
858 return ret;
859 }
860
861 dy = strtol(duplicate + dypos, &end, 10);
862
863 if ((duplicate + dypos) == end) {
864 (*warnings)++;
865
866 if (*warnings <= BMD_WARNING_MAX) {
867 fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate);
868 }
869
870 return ret;
871 }
872
873 if (oy > dy) {
874 ydiff = oy - dy;
875 } else {
876 ydiff = dy - oy;
877 }
878
879 if (ydiff <= max_dist) ret = 1;
880 }
881 }
882
883 return ret;
884 }
885
886
887 /* Using the coordinates from the Illumina read name, see whether the duplicated read is
888 close enough (set by max_dist) to the original to be counted as optical.
889
890 This function needs the values from the first read to be already calculated. */
891
optical_duplicate_partial(const char * name,const int oxpos,const long ox,const long oy,bam1_t * dup,check_t * c,long max_dist,long * warnings)892 static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
893 int ret = 0;
894 char *duplicate;
895 int dxpos = 0;
896 long dx, dy;
897
898 duplicate = bam_get_qname(dup);
899
900 if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) {
901 return ret;
902 }
903
904 if (strncmp(name, duplicate, oxpos - 1) == 0) {
905 // the initial parts match, look at the numbers
906 long xdiff, ydiff;
907
908 if (ox > dx) {
909 xdiff = ox - dx;
910 } else {
911 xdiff = dx - ox;
912 }
913
914 if (xdiff <= max_dist) {
915 // still might be optical
916
917 if (oy > dy) {
918 ydiff = oy - dy;
919 } else {
920 ydiff = dy - oy;
921 }
922
923 if (ydiff <= max_dist) ret = 1;
924 }
925 }
926
927 c->x = dx;
928 c->y = dy;
929 c->xpos = dxpos;
930
931 return ret;
932 }
933
934
935 /* Mark the read as a duplicate and update the duplicate hash (if needed) */
mark_duplicates(md_param_t * param,khash_t (duplicates)* dup_hash,bam1_t * ori,bam1_t * dup,long * optical,long * warn)936 static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
937 long *optical, long *warn) {
938 char dup_type = 0;
939 long incoming_warnings = *warn;
940
941 dup->core.flag |= BAM_FDUP;
942
943 if (param->tag) {
944 if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
945 fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
946 return -1;
947 }
948 }
949
950 if (param->opt_dist) { // mark optical duplicates
951 if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
952 bam_aux_update_str(dup, "dt", 3, "SQ");
953 dup_type = 'O';
954 (*optical)++;
955 } else {
956 // not an optical duplicate
957 bam_aux_update_str(dup, "dt", 3, "LB");
958 }
959 }
960
961 if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) {
962 fprintf(stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n",
963 *warn);
964 }
965
966 if (param->supp) {
967 if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) {
968 char *original = NULL;
969
970 if (param->tag) {
971 original = bam_get_qname(ori);
972 }
973
974 if (add_duplicate(dup_hash, dup, original, dup_type))
975 return -1;
976 }
977 }
978
979 return 0;
980 }
981
982
983 /* If the duplicate type has changed to optical then retag and duplicate hash. */
optical_retag(md_param_t * param,khash_t (duplicates)* dup_hash,bam1_t * b,int paired,long * optical_single,long * optical_pair)984 static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
985 int ret = 0;
986
987 if (bam_aux_update_str(b, "dt", 3, "SQ")) {
988 fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n");
989 ret = -1;
990 }
991
992 if (paired) {
993 (*optical_pair)++;
994 } else {
995 (*optical_single)++;
996 }
997
998 if (param->supp) {
999 // Change the duplicate type
1000
1001 if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP)
1002 || bam_aux_get(b, "XA")) {
1003 khiter_t d;
1004
1005 d = kh_get(duplicates, dup_hash, bam_get_qname(b));
1006
1007 if (d == kh_end(dup_hash)) {
1008 // error, name should already be in dup hash
1009 fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n",
1010 bam_get_qname(b));
1011 ret = -1;
1012 } else {
1013 kh_value(dup_hash, d).type = 'O';
1014 }
1015 }
1016 }
1017
1018 return ret;
1019 }
1020
1021
1022 /* Check all duplicates of the highest quality read (the "original") for consistancy. Also
1023 pre-calculate any values for use in check_duplicate_chain later.
1024 Returns 0 on success, >0 on coordinate reading error (program can continue) or
1025 <0 on an error (program should not continue. */
check_chain_against_original(md_param_t * param,khash_t (duplicates)* dup_hash,read_queue_t * ori,check_list_t * list,long * warn,long * optical_single,long * optical_pair)1026 static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
1027 check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
1028
1029 int ret = 0;
1030 char *ori_name = bam_get_qname(ori->b);
1031 read_queue_t *current = ori->duplicate;
1032 int xpos;
1033 long x, y;
1034
1035 if (param->opt_dist) {
1036 if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
1037 return ret;
1038 }
1039 }
1040
1041 list->length = 0;
1042
1043 while (current) {
1044 check_t *c;
1045
1046 if (list->length >= list->size) {
1047 check_t *tmp;
1048
1049 list->size *= 2;
1050
1051 if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
1052 fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n");
1053 return -1;
1054 }
1055
1056 list->c = tmp;
1057 }
1058
1059 c = &list->c[list->length];
1060
1061 c->b = current->b;
1062 c->x = -1;
1063 c->y = -1;
1064 c->opt = 0;
1065 c->score = 0;
1066 c->mate_score = 0;
1067 current->dup_checked = 1;
1068
1069 if (param->tag) {
1070 uint8_t *data;
1071
1072 // at this stage all duplicates should have a do tag
1073 if ((data = bam_aux_get(current->b, "do")) != NULL) {
1074 // see if we need to change the tag
1075 char *old_name = bam_aux2Z(data);
1076
1077 if (old_name) {
1078 if (strcmp(old_name, ori_name) != 0) {
1079 if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
1080 fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n");
1081 ret = -1;
1082 break;
1083 }
1084 }
1085 } else {
1086 fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
1087 ret = -1;
1088 break;
1089 }
1090 }
1091 }
1092
1093 if (param->opt_dist) {
1094 uint8_t *data;
1095 char *dup_type;
1096 int is_opt = 0;
1097 int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
1098
1099 if ((data = bam_aux_get(current->b, "dt"))) {
1100 if ((dup_type = bam_aux2Z(data))) {
1101 if (strcmp(dup_type, "SQ") == 0) {
1102 c->opt = 1;
1103 }
1104 }
1105 }
1106
1107 // need to run this to get the duplicates x and y scores
1108 is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
1109
1110 if (!c->opt && is_opt) {
1111 if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
1112 ret = -1;
1113 break;
1114 }
1115
1116 c->opt = 1;
1117 }
1118
1119 c->score = calc_score(current->b);
1120
1121 if (current_paired) {
1122 if ((c->mate_score = get_mate_score(current->b)) == -1) {
1123 fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1124 ret = -1;
1125 break;
1126 }
1127 }
1128 }
1129
1130 current = current->duplicate;
1131 list->length++;
1132 }
1133
1134 return ret;
1135 }
1136
1137
xcoord_sort(const void * a,const void * b)1138 static int xcoord_sort(const void *a, const void *b) {
1139 check_t *ac = (check_t *) a;
1140 check_t *bc = (check_t *) b;
1141
1142 return (ac->x - bc->x);
1143 }
1144
1145
1146 /* Check all the duplicates against each other to see if they are optical duplicates. */
check_duplicate_chain(md_param_t * param,khash_t (duplicates)* dup_hash,check_list_t * list,long * warn,long * optical_single,long * optical_pair)1147 static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
1148 long *warn, long *optical_single, long *optical_pair) {
1149 int ret = 0;
1150 size_t curr = 0;
1151
1152 qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
1153
1154 while (curr < list->length - 1) {
1155 check_t *current = &list->c[curr];
1156 size_t count = curr;
1157 char *cur_name = bam_get_qname(current->b);
1158 int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
1159
1160 while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
1161 // while close enough along the x coordinate
1162 check_t *chk = &list->c[count];
1163
1164 if (current->opt && chk->opt)
1165 continue;
1166
1167 // if both are already optical duplicates there is no need to check again, otherwise...
1168
1169 long ydiff;
1170
1171 if (current->y > chk->y) {
1172 ydiff = current->y - chk->y;
1173 } else {
1174 ydiff = chk->y - current->y;
1175 }
1176
1177 if (ydiff > param->opt_dist)
1178 continue;
1179
1180 // the number are right, check the names
1181 if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0)
1182 continue;
1183
1184 // optical duplicates
1185 int chk_dup = 0;
1186 int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
1187
1188 if (current_paired != chk_paired) {
1189 if (!chk_paired) {
1190 // chk is single vs pair, this is a dup.
1191 chk_dup = 1;
1192 }
1193 } else {
1194 // do it by scores
1195 int64_t cur_score, chk_score;
1196
1197 if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
1198 if (current->b->core.flag & BAM_FQCFAIL) {
1199 cur_score = 0;
1200 chk_score = 1;
1201 } else {
1202 cur_score = 1;
1203 chk_score = 0;
1204 }
1205 } else {
1206 cur_score = current->score;
1207 chk_score = chk->score;
1208
1209 if (current_paired) {
1210 // they are pairs so add mate scores.
1211 chk_score += chk->mate_score;
1212 cur_score += current->mate_score;
1213 }
1214 }
1215
1216 if (cur_score == chk_score) {
1217 if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
1218 chk_score++;
1219 } else {
1220 chk_score--;
1221 }
1222 }
1223
1224 if (cur_score > chk_score) {
1225 chk_dup = 1;
1226 }
1227 }
1228
1229 if (chk_dup) {
1230 // the duplicate is the optical duplicate
1231 if (!chk->opt) { // only change if not already an optical duplicate
1232 if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
1233 ret = -1;
1234 goto fail;
1235 }
1236
1237 chk->opt = 1;
1238 }
1239 } else {
1240 if (!current->opt) {
1241 if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
1242 ret = -1;
1243 goto fail;
1244 }
1245
1246 current->opt = 1;
1247 }
1248 }
1249 }
1250
1251 curr++;
1252 }
1253
1254 fail:
1255 return ret;
1256 }
1257
1258
1259 /* Where there is more than one duplicate go down the list and check for optical duplicates and change
1260 do tags (where used) to point to original (non-duplicate) read. */
find_duplicate_chains(md_param_t * param,klist_t (read_queue)* read_buffer,khash_t (duplicates)* dup_hash,check_list_t * dup_list,const hts_pos_t prev_coord,const int32_t prev_tid,long * warn,long * optical_single,long * optical_pair,const int check_range)1261 static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
1262 const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
1263 long *optical_pair, const int check_range) {
1264 int ret = 0;
1265 kliter_t(read_queue) *rq;
1266
1267 rq = kl_begin(read_buffer);
1268
1269 while (rq != kl_end(read_buffer)) {
1270 read_queue_t *in_read = &kl_val(rq);
1271
1272 if (check_range) {
1273 /* Just check against the moving window of reads based on coordinates and max read length. */
1274 if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
1275 break;
1276 }
1277 } else {
1278 // this is the last set of results and the end entry will be blank
1279 if (!bam_get_qname(in_read->b)) {
1280 break;
1281 }
1282 }
1283
1284 if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
1285
1286 // check against the original for tagging and optical duplication
1287 if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
1288 if (ret < 0) { // real error
1289 ret = -1;
1290 break;
1291 } else { // coordinate decoding error
1292 ret = 0;
1293 in_read->duplicate = NULL;
1294 continue;
1295 }
1296 }
1297
1298 // check the rest of the duplicates against each other for optical duplication
1299 if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
1300 ret = -1;
1301 break;
1302 }
1303
1304 in_read->duplicate = NULL;
1305 }
1306
1307 rq = kl_next(rq);
1308 }
1309
1310 return ret;
1311 }
1312
1313
1314 /*
1315 Function to use when estimating library size.
1316
1317 This is based on an approximate formula for the coverage of a set
1318 obtained after sampling it a given number of times with replacement.
1319
1320 x = number of items in the set (the number of unique fragments in the library)
1321
1322 c = number of unique items (unique read pairs observed)
1323
1324 n = number of items samples (total number of read pairs)
1325
1326 c and n are known; x is unknown.
1327
1328 As n -> infinity, the coverage (c/x) can be given as:
1329
1330 c / x = 1 - exp(-n / x) (see https://math.stackexchange.com/questions/32800)
1331
1332 This needs to be solved for x, so it is rearranged to put both terms on the
1333 left side and estimate_library_size() finds a value of x which gives a
1334 result of zero (or as close as it can get).
1335 */
coverage_equation(double x,double c,double n)1336 static inline double coverage_equation(double x, double c, double n) {
1337 return c / x - 1 + exp(-n / x);
1338 }
1339
1340
1341 /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
estimate_library_size(unsigned long paired_reads,unsigned long paired_duplicate_reads,unsigned long optical)1342 static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
1343 unsigned long estimated_size = 0;
1344 unsigned long non_optical_pairs = (paired_reads - optical) / 2;
1345 unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
1346 unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
1347
1348 if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
1349 double m = 1;
1350 double M = 100;
1351 int i;
1352
1353 if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
1354 fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n");
1355 return estimated_size;
1356 }
1357
1358 while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
1359 M *= 10;
1360 }
1361
1362 for (i = 0; i < 40; i++) {
1363 double r = (m + M) / 2;
1364 double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
1365
1366 if (u > 0) {
1367 m = r;
1368 } else if (u < 0) {
1369 M = r;
1370 } else {
1371 break;
1372 }
1373 }
1374
1375 estimated_size = (unsigned long)(unique_pairs * (m + M) / 2);
1376 } else {
1377 fprintf(stderr, "[markdup] warning: unable to calculate estimated library size."
1378 " Read pairs %ld should be greater than duplicate pairs %ld,"
1379 " which should both be non zero.\n",
1380 non_optical_pairs, duplicate_pairs);
1381 }
1382
1383 return estimated_size;
1384 }
1385
1386
1387 /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
1388 Generally the highest quality scoring is chosen as the original and all others the duplicates.
1389 The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
1390 While single reads are compared to only one read of a pair, the pair will chosen as the original.
1391 The comparison is done on position and orientation, see above for details.
1392
1393 Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write
1394 step. This is because the duplicate can occur before the primary read.*/
1395
bam_mark_duplicates(md_param_t * param)1396 static int bam_mark_duplicates(md_param_t *param) {
1397 bam_hdr_t *header = NULL;
1398 khiter_t k;
1399 khash_t(reads) *pair_hash = kh_init(reads);
1400 khash_t(reads) *single_hash = kh_init(reads);
1401 klist_t(read_queue) *read_buffer = kl_init(read_queue);
1402 kliter_t(read_queue) *rq;
1403 khash_t(duplicates) *dup_hash = kh_init(duplicates);
1404 int32_t prev_tid;
1405 hts_pos_t prev_coord;
1406 read_queue_t *in_read;
1407 int ret;
1408 long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
1409 long np_duplicate, np_opt_duplicate;
1410 long opt_warnings = 0;
1411 tmp_file_t temp;
1412 char *idx_fn = NULL;
1413 int exclude = 0;
1414 check_list_t dup_list = {NULL, 0, 0};
1415
1416 if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
1417 fprintf(stderr, "[markdup] out of memory\n");
1418 goto fail;
1419 }
1420
1421 if ((header = sam_hdr_read(param->in)) == NULL) {
1422 fprintf(stderr, "[markdup] error reading header\n");
1423 goto fail;
1424 }
1425
1426 // accept unknown, unsorted or coordinate sort order, but error on queryname sorted.
1427 // only really works on coordinate sorted files.
1428 kstring_t str = KS_INITIALIZE;
1429 if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) {
1430 fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
1431 ks_free(&str);
1432 goto fail;
1433 }
1434 ks_free(&str);
1435
1436 if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
1437 param->arg_list ? "CL" : NULL,
1438 param->arg_list ? param->arg_list : NULL,
1439 NULL) != 0) {
1440 fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n");
1441 }
1442
1443 if (sam_hdr_write(param->out, header) < 0) {
1444 fprintf(stderr, "[markdup] error writing header.\n");
1445 goto fail;
1446 }
1447 if (param->write_index) {
1448 if (!(idx_fn = auto_index(param->out, param->out_fn, header)))
1449 goto fail;
1450 }
1451
1452 // used for coordinate order checks
1453 prev_tid = prev_coord = 0;
1454
1455 // get the buffer going
1456 in_read = kl_pushp(read_queue, read_buffer);
1457 if (!in_read) {
1458 fprintf(stderr, "[markdup] out of memory\n");
1459 goto fail;
1460 }
1461
1462 // handling supplementary reads needs a temporary file
1463 if (param->supp) {
1464 if (tmp_file_open_write(&temp, param->prefix, 1)) {
1465 fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix);
1466 goto fail;
1467 }
1468 }
1469
1470 if ((in_read->b = bam_init1()) == NULL) {
1471 fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
1472 goto fail;
1473 }
1474
1475 if (param->check_chain && !(param->tag || param->opt_dist))
1476 param->check_chain = 0;
1477
1478 if (param->check_chain) {
1479 dup_list.size = 128;
1480 dup_list.c = NULL;
1481
1482 if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
1483 fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
1484 goto fail;
1485 }
1486 }
1487
1488 reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
1489 np_duplicate = np_opt_duplicate = 0;
1490
1491 while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
1492 int dup_checked = 0;
1493
1494 // do some basic coordinate order checks
1495 if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
1496 if (in_read->b->core.tid < prev_tid ||
1497 ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
1498 fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n");
1499 goto fail;
1500 }
1501 }
1502
1503 prev_coord = in_read->pos = in_read->b->core.pos;
1504 prev_tid = in_read->b->core.tid;
1505 in_read->pair_key.single = 1;
1506 in_read->single_key.single = 0;
1507 in_read->duplicate = NULL;
1508 in_read->dup_checked = 0;
1509
1510 reading++;
1511
1512 if (param->clear && (in_read->b->core.flag & BAM_FDUP)) {
1513 uint8_t *data;
1514
1515 in_read->b->core.flag ^= BAM_FDUP;
1516
1517 if ((data = bam_aux_get(in_read->b, "dt")) != NULL) {
1518 bam_aux_del(in_read->b, data);
1519 }
1520
1521 if ((data = bam_aux_get(in_read->b, "do")) != NULL) {
1522 bam_aux_del(in_read->b, data);
1523 }
1524 }
1525
1526 if (param->include_fails) {
1527 exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP);
1528 } else {
1529 exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL);
1530 }
1531
1532 // read must not be secondary, supplementary, unmapped or (possibly) failed QC
1533 if (!(in_read->b->core.flag & exclude)) {
1534 examined++;
1535
1536
1537 // look at the pairs first
1538 if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
1539 int ret, mate_tmp;
1540 key_data_t pair_key;
1541 key_data_t single_key;
1542 in_hash_t *bp;
1543
1544 if (param->mode) {
1545 if (make_pair_key_sequence(&pair_key, in_read->b)) {
1546 fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
1547 goto fail;
1548 }
1549 } else {
1550 if (make_pair_key_template(&pair_key, in_read->b)) {
1551 fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
1552 goto fail;
1553 }
1554 }
1555
1556 make_single_key(&single_key, in_read->b);
1557
1558 pair++;
1559 in_read->pos = single_key.this_coord; // cigar/orientation modified pos
1560
1561 // put in singles hash for checking against non paired reads
1562 k = kh_put(reads, single_hash, single_key, &ret);
1563
1564 if (ret > 0) { // new
1565 // add to single duplicate hash
1566 bp = &kh_val(single_hash, k);
1567 bp->p = in_read;
1568 in_read->single_key = single_key;
1569 } else if (ret == 0) { // exists
1570 // look at singles only for duplication marking
1571 bp = &kh_val(single_hash, k);
1572
1573 if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) {
1574 // singleton will always be marked duplicate even if
1575 // scores more than one read of the pair
1576 bam1_t *dup = bp->p->b;
1577
1578 if (param->check_chain)
1579 in_read->duplicate = bp->p;
1580
1581 bp->p = in_read;
1582
1583 if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
1584 goto fail;
1585
1586 single_dup++;
1587 }
1588 } else {
1589 fprintf(stderr, "[markdup] error: single hashing failure.\n");
1590 goto fail;
1591 }
1592
1593 // now do the pair
1594 k = kh_put(reads, pair_hash, pair_key, &ret);
1595
1596 if (ret > 0) { // new
1597 // add to the pair hash
1598 bp = &kh_val(pair_hash, k);
1599 bp->p = in_read;
1600 in_read->pair_key = pair_key;
1601 } else if (ret == 0) {
1602 int64_t old_score, new_score, tie_add = 0;
1603 bam1_t *dup = NULL;
1604
1605 bp = &kh_val(pair_hash, k);
1606
1607 if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) {
1608 if (bp->p->b->core.flag & BAM_FQCFAIL) {
1609 old_score = 0;
1610 new_score = 1;
1611 } else {
1612 old_score = 1;
1613 new_score = 0;
1614 }
1615 } else {
1616 if ((mate_tmp = get_mate_score(bp->p->b)) == -1) {
1617 fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1618 goto fail;
1619 } else {
1620 old_score = calc_score(bp->p->b) + mate_tmp;
1621 }
1622
1623 if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
1624 fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
1625 goto fail;
1626 } else {
1627 new_score = calc_score(in_read->b) + mate_tmp;
1628 }
1629 }
1630
1631 // choose the highest score as the original
1632 // and add it to the pair hash, mark the other as duplicate
1633
1634 if (new_score == old_score) {
1635 if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) {
1636 tie_add = 1;
1637 } else {
1638 tie_add = -1;
1639 }
1640 }
1641
1642 if (new_score + tie_add > old_score) { // swap reads
1643 dup = bp->p->b;
1644
1645 if (param->check_chain) {
1646
1647 if (in_read->duplicate) {
1648 read_queue_t *current = in_read->duplicate;
1649
1650 while (current->duplicate) {
1651 current = current->duplicate;
1652 }
1653
1654 current->duplicate = bp->p;
1655 } else {
1656 in_read->duplicate = bp->p;
1657 }
1658 }
1659
1660 bp->p = in_read;
1661 } else {
1662 if (param->check_chain) {
1663 if (bp->p->duplicate) {
1664 if (in_read->duplicate) {
1665 read_queue_t *current = bp->p->duplicate;
1666
1667 while (current->duplicate) {
1668 current = current->duplicate;
1669 }
1670
1671 current->duplicate = in_read->duplicate;
1672 }
1673
1674 in_read->duplicate = bp->p->duplicate;
1675 }
1676
1677 bp->p->duplicate = in_read;
1678 }
1679
1680 dup = in_read->b;
1681 }
1682
1683 if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
1684 goto fail;
1685
1686 duplicate++;
1687 } else {
1688 fprintf(stderr, "[markdup] error: pair hashing failure.\n");
1689 goto fail;
1690 }
1691 } else { // do the single (or effectively single) reads
1692 int ret;
1693 key_data_t single_key;
1694 in_hash_t *bp;
1695
1696 make_single_key(&single_key, in_read->b);
1697
1698 single++;
1699 in_read->pos = single_key.this_coord; // cigar/orientation modified pos
1700
1701 k = kh_put(reads, single_hash, single_key, &ret);
1702
1703 if (ret > 0) { // new
1704 bp = &kh_val(single_hash, k);
1705 bp->p = in_read;
1706 in_read->single_key = single_key;
1707 } else if (ret == 0) { // exists
1708 bp = &kh_val(single_hash, k);
1709
1710 if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
1711 // if matched against one of a pair just mark as duplicate
1712
1713 if (param->check_chain) {
1714 if (bp->p->duplicate) {
1715 in_read->duplicate = bp->p->duplicate;
1716 }
1717
1718 bp->p->duplicate = in_read;
1719 }
1720
1721 if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
1722 goto fail;
1723
1724 } else {
1725 int64_t old_score, new_score;
1726 bam1_t *dup = NULL;
1727
1728 old_score = calc_score(bp->p->b);
1729 new_score = calc_score(in_read->b);
1730
1731 // choose the highest score as the original, add it
1732 // to the single hash and mark the other as duplicate
1733 if (new_score > old_score) { // swap reads
1734 dup = bp->p->b;
1735
1736 if (param->check_chain)
1737 in_read->duplicate = bp->p;
1738
1739 bp->p = in_read;
1740 } else {
1741 if (param->check_chain) {
1742 if (bp->p->duplicate) {
1743 in_read->duplicate = bp->p->duplicate;
1744 }
1745
1746 bp->p->duplicate = in_read;
1747 }
1748
1749 dup = in_read->b;
1750 }
1751
1752 if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
1753 goto fail;
1754 }
1755
1756 single_dup++;
1757 } else {
1758 fprintf(stderr, "[markdup] error: single hashing failure.\n");
1759 goto fail;
1760 }
1761 }
1762 } else {
1763 excluded++;
1764 }
1765
1766 // loop through the stored reads and write out those we
1767 // no longer need
1768 rq = kl_begin(read_buffer);
1769 while (rq != kl_end(read_buffer)) {
1770 in_read = &kl_val(rq);
1771
1772 /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads
1773 should just be written as they cannot be matched as duplicates. */
1774 if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
1775 break;
1776 }
1777
1778 if (!dup_checked && param->check_chain) {
1779 // check for multiple optical duplicates of the same original read
1780
1781 if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
1782 fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
1783 goto fail;
1784 }
1785
1786 dup_checked = 1;
1787 }
1788
1789
1790 if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
1791 break;
1792 }
1793
1794 if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
1795 if (param->supp) {
1796 if (tmp_file_write(&temp, in_read->b)) {
1797 fprintf(stderr, "[markdup] error: writing temp output failed.\n");
1798 goto fail;
1799 }
1800 } else {
1801 if (sam_write1(param->out, header, in_read->b) < 0) {
1802 fprintf(stderr, "[markdup] error: writing output failed.\n");
1803 goto fail;
1804 }
1805 }
1806
1807 writing++;
1808 }
1809
1810 // remove from hash
1811 if (in_read->pair_key.single == 0) {
1812 k = kh_get(reads, pair_hash, in_read->pair_key);
1813 kh_del(reads, pair_hash, k);
1814 }
1815
1816 if (in_read->single_key.single == 1) {
1817 k = kh_get(reads, single_hash, in_read->single_key);
1818 kh_del(reads, single_hash, k);
1819 }
1820
1821 kl_shift(read_queue, read_buffer, NULL);
1822 bam_destroy1(in_read->b);
1823 rq = kl_begin(read_buffer);
1824 }
1825
1826 // set the next one up for reading
1827 in_read = kl_pushp(read_queue, read_buffer);
1828 if (!in_read) {
1829 fprintf(stderr, "[markdup] out of memory\n");
1830 goto fail;
1831 }
1832
1833 if ((in_read->b = bam_init1()) == NULL) {
1834 fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
1835 goto fail;
1836 }
1837 }
1838
1839 if (ret < -1) {
1840 fprintf(stderr, "[markdup] error: truncated input file.\n");
1841 goto fail;
1842 }
1843
1844 // one last check
1845 if (param->tag || param->opt_dist) {
1846 if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
1847 fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
1848 goto fail;
1849 }
1850 }
1851
1852 // write out the end of the list
1853 rq = kl_begin(read_buffer);
1854 while (rq != kl_end(read_buffer)) {
1855 in_read = &kl_val(rq);
1856
1857 if (bam_get_qname(in_read->b)) { // last entry will be blank
1858 if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
1859 if (param->supp) {
1860 if (tmp_file_write(&temp, in_read->b)) {
1861 fprintf(stderr, "[markdup] error: writing temp output failed.\n");
1862 goto fail;
1863 }
1864 } else {
1865 if (sam_write1(param->out, header, in_read->b) < 0) {
1866 fprintf(stderr, "[markdup] error: writing output failed.\n");
1867 goto fail;
1868 }
1869 }
1870
1871 writing++;
1872 }
1873 }
1874
1875 kl_shift(read_queue, read_buffer, NULL);
1876 bam_destroy1(in_read->b);
1877 rq = kl_begin(read_buffer);
1878 }
1879
1880 if (param->supp) {
1881 bam1_t *b;
1882
1883 if (tmp_file_end_write(&temp)) {
1884 fprintf(stderr, "[markdup] error: unable to end tmp writing.\n");
1885 goto fail;
1886 }
1887
1888 // read data from temp file and mark duplicate supplementary alignments
1889
1890 if (tmp_file_begin_read(&temp)) {
1891 goto fail;
1892 }
1893
1894 b = bam_init1();
1895
1896 while ((ret = tmp_file_read(&temp, b)) > 0) {
1897
1898 if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) {
1899
1900 k = kh_get(duplicates, dup_hash, bam_get_qname(b));
1901
1902 if (k != kh_end(dup_hash)) {
1903
1904 b->core.flag |= BAM_FDUP;
1905 np_duplicate++;
1906
1907 if (param->tag && kh_val(dup_hash, k).name) {
1908 if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
1909 fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
1910 goto fail;
1911 }
1912 }
1913
1914 if (param->opt_dist) {
1915 if (kh_val(dup_hash, k).type) {
1916 bam_aux_update_str(b, "dt", 3, "SQ");
1917 np_opt_duplicate++;
1918 } else {
1919 bam_aux_update_str(b, "dt", 3, "LB");
1920 }
1921 }
1922 }
1923 }
1924
1925 if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
1926 if (sam_write1(param->out, header, b) < 0) {
1927 fprintf(stderr, "[markdup] error: writing final output failed.\n");
1928 goto fail;
1929 }
1930 }
1931 }
1932
1933 if (ret == -1) {
1934 fprintf(stderr, "[markdup] error: failed to read tmp file.\n");
1935 goto fail;
1936 }
1937
1938 for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) {
1939 if (kh_exist(dup_hash, k)) {
1940 free(kh_val(dup_hash, k).name);
1941 free((char *)kh_key(dup_hash, k));
1942 kh_key(dup_hash, k) = NULL;
1943 }
1944 }
1945
1946 tmp_file_destroy(&temp);
1947 bam_destroy1(b);
1948 }
1949
1950 if (opt_warnings) {
1951 fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n",
1952 opt_warnings);
1953 }
1954
1955 if (param->do_stats) {
1956 FILE *fp;
1957 int file_open = 0;
1958 unsigned long els;
1959
1960 if (param->stats_file) {
1961 if (NULL == (fp = fopen(param->stats_file, "w"))) {
1962 fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file);
1963 fp = stderr;
1964 } else {
1965 file_open = 1;
1966 }
1967 } else {
1968 fp = stderr;
1969 }
1970
1971 els = estimate_library_size(pair, duplicate, optical);
1972
1973 fprintf(fp,
1974 "COMMAND: %s\n"
1975 "READ: %ld\n"
1976 "WRITTEN: %ld\n"
1977 "EXCLUDED: %ld\n"
1978 "EXAMINED: %ld\n"
1979 "PAIRED: %ld\n"
1980 "SINGLE: %ld\n"
1981 "DUPLICATE PAIR: %ld\n"
1982 "DUPLICATE SINGLE: %ld\n"
1983 "DUPLICATE PAIR OPTICAL: %ld\n"
1984 "DUPLICATE SINGLE OPTICAL: %ld\n"
1985 "DUPLICATE NON PRIMARY: %ld\n"
1986 "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
1987 "DUPLICATE PRIMARY TOTAL: %ld\n"
1988 "DUPLICATE TOTAL: %ld\n"
1989 "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single,
1990 duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate,
1991 single_dup + duplicate, single_dup + duplicate + np_duplicate, els);
1992
1993 if (file_open) {
1994 fclose(fp);
1995 }
1996 }
1997
1998 if (param->write_index) {
1999 if (sam_idx_save(param->out) < 0) {
2000 print_error_errno("markdup", "writing index failed");
2001 goto fail;
2002 }
2003 }
2004
2005 if (param->check_chain && (param->tag || param->opt_dist))
2006 free(dup_list.c);
2007
2008 kh_destroy(reads, pair_hash);
2009 kh_destroy(reads, single_hash);
2010 kl_destroy(read_queue, read_buffer);
2011 kh_destroy(duplicates, dup_hash);
2012 sam_hdr_destroy(header);
2013
2014 return 0;
2015
2016 fail:
2017 for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq))
2018 bam_destroy1(kl_val(rq).b);
2019 kl_destroy(read_queue, read_buffer);
2020
2021 for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) {
2022 if (kh_exist(dup_hash, k)) {
2023 free((char *)kh_key(dup_hash, k));
2024 }
2025 }
2026 kh_destroy(duplicates, dup_hash);
2027
2028 if (param->check_chain && (param->tag || param->opt_dist))
2029 free(dup_list.c);
2030
2031 kh_destroy(reads, pair_hash);
2032 kh_destroy(reads, single_hash);
2033 sam_hdr_destroy(header);
2034 return 1;
2035 }
2036
2037
markdup_usage(void)2038 static int markdup_usage(void) {
2039 fprintf(stderr, "\n");
2040 fprintf(stderr, "Usage: samtools markdup <input.bam> <output.bam>\n\n");
2041 fprintf(stderr, "Option: \n");
2042 fprintf(stderr, " -r Remove duplicate reads\n");
2043 fprintf(stderr, " -l INT Max read length (default 300 bases)\n");
2044 fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n");
2045 fprintf(stderr, " -s Report stats.\n");
2046 fprintf(stderr, " -f NAME Write stats to named file. Implies -s.\n");
2047 fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n");
2048 fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n");
2049 fprintf(stderr, " -c Clear previous duplicate settings and tags.\n");
2050 fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n"
2051 " TYPE = t measure positions based on template start/end (default).\n"
2052 " s measure positions based on sequence start.\n");
2053 fprintf(stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n");
2054 fprintf(stderr, " -u Output uncompressed data\n");
2055 fprintf(stderr, " --include-fails Include quality check failed reads.\n");
2056 fprintf(stderr, " --no-PG Do not add a PG line\n");
2057 fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n");
2058 fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
2059 " Mainly for information and debugging.\n");
2060
2061 sam_global_opt_help(stderr, "-.O..@..");
2062
2063 fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone"
2064 " through fixmates with the mate scoring option on.\n");
2065
2066 return 1;
2067 }
2068
2069
bam_markdup(int argc,char ** argv)2070 int bam_markdup(int argc, char **argv) {
2071 int c, ret;
2072 char wmode[4] = {'w', 'b', 0, 0};
2073 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
2074 htsThreadPool p = {NULL, 0};
2075 kstring_t tmpprefix = {0, 0, NULL};
2076 struct stat st;
2077 unsigned int t;
2078 md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
2079
2080 static const struct option lopts[] = {
2081 SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
2082 {"include-fails", no_argument, NULL, 1001},
2083 {"no-PG", no_argument, NULL, 1002},
2084 {"mode", required_argument, NULL, 'm'},
2085 {"no-multi-dup", no_argument, NULL, 1003},
2086 {NULL, 0, NULL, 0}
2087 };
2088
2089 while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
2090 switch (c) {
2091 case 'r': param.remove_dups = 1; break;
2092 case 'l': param.max_length = atoi(optarg); break;
2093 case 's': param.do_stats = 1; break;
2094 case 'T': kputs(optarg, &tmpprefix); break;
2095 case 'S': param.supp = 1; break;
2096 case 't': param.tag = 1; break;
2097 case 'f': param.stats_file = optarg; param.do_stats = 1; break;
2098 case 'd': param.opt_dist = atoi(optarg); break;
2099 case 'c': param.clear = 1; break;
2100 case 'm':
2101 if (strcmp(optarg, "t") == 0) {
2102 param.mode = MD_MODE_TEMPLATE;
2103 } else if (strcmp(optarg, "s") == 0) {
2104 param.mode = MD_MODE_SEQUENCE;
2105 } else {
2106 fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg);
2107 return markdup_usage();
2108 }
2109
2110 break;
2111 case 'u': wmode[2] = '0'; break;
2112 case 1001: param.include_fails = 1; break;
2113 case 1002: param.no_pg = 1; break;
2114 case 1003: param.check_chain = 0; break;
2115 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
2116 /* else fall-through */
2117 case '?': return markdup_usage();
2118 }
2119 }
2120
2121 if (optind + 2 > argc)
2122 return markdup_usage();
2123
2124 if (param.opt_dist < 0) param.opt_dist = 0;
2125 if (param.max_length < 0) param.max_length = 300;
2126
2127 param.in = sam_open_format(argv[optind], "r", &ga.in);
2128
2129 if (!param.in) {
2130 print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
2131 return 1;
2132 }
2133
2134 sam_open_mode(wmode + 1, argv[optind + 1], NULL);
2135 param.out = sam_open_format(argv[optind + 1], wmode, &ga.out);
2136
2137 if (!param.out) {
2138 print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
2139 return 1;
2140 }
2141
2142 if (ga.nthreads > 0) {
2143 if (!(p.pool = hts_tpool_init(ga.nthreads))) {
2144 fprintf(stderr, "[markdup] error creating thread pool\n");
2145 return 1;
2146 }
2147
2148 hts_set_opt(param.in, HTS_OPT_THREAD_POOL, &p);
2149 hts_set_opt(param.out, HTS_OPT_THREAD_POOL, &p);
2150 }
2151
2152 // actual stuff happens here
2153
2154 // we need temp files so fix up the name here
2155 if (tmpprefix.l == 0) {
2156
2157 if (strcmp(argv[optind + 1], "-") != 0)
2158 ksprintf(&tmpprefix, "%s.", argv[optind + 1]);
2159 else
2160 kputc('.', &tmpprefix);
2161 }
2162
2163 if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
2164 if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix);
2165 }
2166
2167 t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2168 ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
2169 param.prefix = tmpprefix.s;
2170
2171 param.arg_list = stringify_argv(argc + 1, argv - 1);
2172 param.write_index = ga.write_index;
2173 param.out_fn = argv[optind + 1];
2174
2175 ret = bam_mark_duplicates(¶m);
2176
2177 sam_close(param.in);
2178
2179 if (sam_close(param.out) < 0) {
2180 fprintf(stderr, "[markdup] error closing output file\n");
2181 ret = 1;
2182 }
2183
2184 if (p.pool) hts_tpool_destroy(p.pool);
2185
2186 free(param.arg_list);
2187 free(tmpprefix.s);
2188 sam_global_args_free(&ga);
2189
2190 return ret;
2191 }
2192