1#!/usr/local/bin/perl -w
2#
3#    Copyright (C) 2013-2018 Genome Research Ltd.
4#
5#    Author: James Bonfield <jkb@sanger.ac.uk>
6#
7# Permission is hereby granted, free of charge, to any person obtaining a copy
8# of this software and associated documentation files (the "Software"), to deal
9# in the Software without restriction, including without limitation the rights
10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11# copies of the Software, and to permit persons to whom the Software is
12# furnished to do so, subject to the following conditions:
13#
14# The above copyright notice and this permission notice shall be included in
15# all copies or substantial portions of the Software.
16#
17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23# DEALINGS IN THE SOFTWARE.
24
25# Compares two SAM files to report differences.
26# Optionally can skip header or ignore specific types of diff.
27
28use strict;
29use Getopt::Long;
30
31my %opts;
32GetOptions(\%opts, 'noqual', 'noaux', 'notemplate', 'unknownrg', 'nomd', 'partialmd=i', 'template-1', 'noflag', 'Baux');
33
34my ($fn1, $fn2) = @ARGV;
35open(my $fd1, "<", $fn1) || die $!;
36open(my $fd2, "<", $fn2) || die $!;
37
38# Headers
39my ($c1,$c2)=(1,1);
40my (@hd1, @hd2, $ln1, $ln2);
41while (<$fd1>) {
42    if (/^@/) {
43        push(@hd1, $_);
44    } else {
45        $ln1 = $_;
46        last;
47    }
48    $c1++;
49}
50
51while (<$fd2>) {
52    if (/^@/) {
53        push(@hd2, $_);
54    } else {
55        $ln2 = $_;
56        last;
57    }
58    $c2++;
59}
60
61# FIXME: to do
62#print "@hd1\n";
63#print "@hd2\n";
64
65# Compare lines
66while ($ln1 && $ln2) {
67    $ln1 =~ s/\015?\012/\n/;
68    $ln2 =~ s/\015?\012/\n/;
69    chomp($ln1);
70    chomp($ln2);
71
72    # Java CRAM adds RG:Z:UNKNOWN when the read-group is absent
73    if (exists $opts{unknownrg}) {
74        $ln1 =~ s/\tRG:Z:UNKNOWN//;
75        $ln2 =~ s/\tRG:Z:UNKNOWN//;
76    }
77
78    if (exists $opts{nomd}) {
79        $ln1 =~ s/\tMD:Z:[A-Z0-9^]*//;
80        $ln2 =~ s/\tMD:Z:[A-Z0-9^]*//;
81        $ln1 =~ s/\tNM:i:\d+//;
82        $ln2 =~ s/\tNM:i:\d+//;
83    }
84
85    # Validate MD and NM only if partialmd & 'file' set, otherwise
86    # discard it.  Ie:
87    #
88    # 1: if file 1 has NM/MD keep in file 2, otherwise discard from file2
89    # 2: if file 2 has NM/MD keep in file 1, otherwise discard from file1
90    # 3: if file 1 and file 2 both have NM/MD keep, otherwise discard.
91    if (exists $opts{partialmd}) {
92        if ($opts{partialmd} & 2) {
93            $ln1 =~ s/\tNM:i:\d+//        unless ($ln2 =~ /\tNM:i:\d+/);
94            $ln1 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln2 =~ /\tMD:Z:[A-Z0-9^]*/);
95        }
96        if ($opts{partialmd} & 1) {
97            $ln2 =~ s/\tNM:i:\d+//        unless ($ln1 =~ /\tNM:i:\d+/);
98            $ln2 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln1 =~ /\tMD:Z:[A-Z0-9^]*/);
99        }
100    }
101
102    my @ln1 = split("\t", $ln1);
103    my @ln2 = split("\t", $ln2);
104
105    # Fix BWA bug: unmapped data should have no alignments
106    if ($ln1[1] & 4) { $ln1[4] = 0; $ln1[5] = "*"; }
107    if ($ln2[1] & 4) { $ln2[4] = 0; $ln2[5] = "*"; }
108
109    # Canonicalise floating point numbers
110    map {s/^(..):f:(.*)/{"$1:f:".($2+0)}/e} @ln1[11..$#ln1];
111    map {s/^(..):f:(.*)/{"$1:f:".($2+0)}/e} @ln2[11..$#ln2];
112
113
114    if (exists $opts{Baux}) {
115        # Turn ??:H:<hex> into ??:B:c,<vals> so we can compare
116        # Cramtools.jar vs htslib encodings.  Probably doable with (un)pack
117        map {s/^(..):H:(.*)/{join(",", "$1:B:C", map {hex $_} $2=~m:..:g)}/e} @ln1[11..$#ln1];
118        map {s/^(..):H:(.*)/{join(",", "$1:B:C", map {hex $_} $2=~m:..:g)}/e} @ln2[11..$#ln2];
119
120        # Canonicalise ??:B:? data series to be unsigned
121        map {s/^(..):B:c(,?)(.*)/{"$1:B:C$2".join(",",map {($_+256)&255} split(",",$3))}/e} @ln1[11..$#ln1];
122        map {s/^(..):B:c(,?)(.*)/{"$1:B:C$2".join(",",map {($_+256)&255} split(",",$3))}/e} @ln2[11..$#ln2];
123
124        map {s/^(..):B:s(,?)(.*)/{"$1:B:S$2".join(",",map {($_+65536)&65535} split(",",$3))}/e} @ln1[11..$#ln1];
125        map {s/^(..):B:s(,?)(.*)/{"$1:B:S$2".join(",",map {($_+65536)&65535} split(",",$3))}/e} @ln2[11..$#ln2];
126
127        map {s/^(..):B:i(,?)(.*)/{"$1:B:I$2".join(",",map {$_<0? ($_+4294967296) : $_} split(",",$3))}/e} @ln1[11..$#ln1];
128        map {s/^(..):B:i(,?)(.*)/{"$1:B:I$2".join(",",map {$_<0? ($_+4294967296) : $_} split(",",$3))}/e} @ln2[11..$#ln2];
129    }
130
131    # Rationalise order of auxiliary fields
132    if (exists $opts{noaux}) {
133        @ln1 = @ln1[0..10];
134        @ln2 = @ln2[0..10];
135    } else {
136        #my @a=@ln1[11..$#ln1];print "<<<@a>>>\n";
137        @ln1[11..$#ln1] = sort @ln1[11..$#ln1];
138        @ln2[11..$#ln2] = sort @ln2[11..$#ln2];
139    }
140
141    if (exists $opts{noqual}) {
142        $ln1[10] = "*";
143        $ln2[10] = "*";
144    }
145
146    if (exists $opts{notemplate}) {
147        @ln1[6..8] = qw/* 0 0/;
148        @ln2[6..8] = qw/* 0 0/;
149    }
150
151    if (exists $opts{noflag}) {
152        $ln1[1] = 0; $ln2[1] = 0;
153    }
154
155    if (exists $opts{'template-1'}) {
156        if (abs($ln1[8] - $ln2[8]) == 1) {
157            $ln1[8] = $ln2[8];
158        }
159    }
160
161    # Cram doesn't uppercase the reference
162    $ln1[9] = uc($ln1[9]);
163    $ln2[9] = uc($ln2[9]);
164
165    # Cram will populate a sequence string that starts as "*"
166    $ln2[9] = "*" if ($ln1[9] eq "*");
167
168    # Fix 0<op> cigar fields
169    $ln1[5] =~ s/(\D|^)0\D/$1/g;
170    $ln1[5] =~ s/^$/*/g;
171    $ln2[5] =~ s/(\D|^)0\D/$1/g;
172    $ln2[5] =~ s/^$/*/g;
173
174    # Fix 10M10M cigar to 20M
175    $ln1[5] =~ s/(\d+)(\D)(\d+)(\2)/$1+$3.$2/e;
176    $ln2[5] =~ s/(\d+)(\D)(\d+)(\2)/$1+$3.$2/e;
177
178    if ("@ln1" ne "@ln2") {
179        print "Diff at lines $fn1:$c1, $fn2:$c2\n";
180        my @s1 = split("","@ln1");
181        my @s2 = split("","@ln2");
182        my $ptr = "";
183        for (my $i=0; $i < $#s1; $i++) {
184            if ($s1[$i] eq $s2[$i]) {
185                $ptr .= "-";
186            } else {
187                last;
188            }
189        }
190        print "1\t@ln1\n2\t@ln2\n\t$ptr^\n\n";
191        exit(1);
192    }
193
194    $ln1 = <$fd1>;
195    $ln2 = <$fd2>;
196
197    $c1++; $c2++;
198}
199
200if (defined($ln1)) {
201    print "EOF on $fn1\n";
202    exit(1);
203}
204
205if (defined($ln2)) {
206    print "EOF on $fn2\n";
207    exit(1);
208}
209
210close($fd1);
211close($fd2);
212
213exit(0);
214