1#!/usr/local/bin/perl -w 2# 3# Copyright (C) 2013-2018 Genome Research Ltd. 4# 5# Author: James Bonfield <jkb@sanger.ac.uk> 6# 7# Permission is hereby granted, free of charge, to any person obtaining a copy 8# of this software and associated documentation files (the "Software"), to deal 9# in the Software without restriction, including without limitation the rights 10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11# copies of the Software, and to permit persons to whom the Software is 12# furnished to do so, subject to the following conditions: 13# 14# The above copyright notice and this permission notice shall be included in 15# all copies or substantial portions of the Software. 16# 17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23# DEALINGS IN THE SOFTWARE. 24 25# Compares two SAM files to report differences. 26# Optionally can skip header or ignore specific types of diff. 27 28use strict; 29use Getopt::Long; 30 31my %opts; 32GetOptions(\%opts, 'noqual', 'noaux', 'notemplate', 'unknownrg', 'nomd', 'partialmd=i', 'template-1', 'noflag', 'Baux'); 33 34my ($fn1, $fn2) = @ARGV; 35open(my $fd1, "<", $fn1) || die $!; 36open(my $fd2, "<", $fn2) || die $!; 37 38# Headers 39my ($c1,$c2)=(1,1); 40my (@hd1, @hd2, $ln1, $ln2); 41while (<$fd1>) { 42 if (/^@/) { 43 push(@hd1, $_); 44 } else { 45 $ln1 = $_; 46 last; 47 } 48 $c1++; 49} 50 51while (<$fd2>) { 52 if (/^@/) { 53 push(@hd2, $_); 54 } else { 55 $ln2 = $_; 56 last; 57 } 58 $c2++; 59} 60 61# FIXME: to do 62#print "@hd1\n"; 63#print "@hd2\n"; 64 65# Compare lines 66while ($ln1 && $ln2) { 67 $ln1 =~ s/\015?\012/\n/; 68 $ln2 =~ s/\015?\012/\n/; 69 chomp($ln1); 70 chomp($ln2); 71 72 # Java CRAM adds RG:Z:UNKNOWN when the read-group is absent 73 if (exists $opts{unknownrg}) { 74 $ln1 =~ s/\tRG:Z:UNKNOWN//; 75 $ln2 =~ s/\tRG:Z:UNKNOWN//; 76 } 77 78 if (exists $opts{nomd}) { 79 $ln1 =~ s/\tMD:Z:[A-Z0-9^]*//; 80 $ln2 =~ s/\tMD:Z:[A-Z0-9^]*//; 81 $ln1 =~ s/\tNM:i:\d+//; 82 $ln2 =~ s/\tNM:i:\d+//; 83 } 84 85 # Validate MD and NM only if partialmd & 'file' set, otherwise 86 # discard it. Ie: 87 # 88 # 1: if file 1 has NM/MD keep in file 2, otherwise discard from file2 89 # 2: if file 2 has NM/MD keep in file 1, otherwise discard from file1 90 # 3: if file 1 and file 2 both have NM/MD keep, otherwise discard. 91 if (exists $opts{partialmd}) { 92 if ($opts{partialmd} & 2) { 93 $ln1 =~ s/\tNM:i:\d+// unless ($ln2 =~ /\tNM:i:\d+/); 94 $ln1 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln2 =~ /\tMD:Z:[A-Z0-9^]*/); 95 } 96 if ($opts{partialmd} & 1) { 97 $ln2 =~ s/\tNM:i:\d+// unless ($ln1 =~ /\tNM:i:\d+/); 98 $ln2 =~ s/\tMD:Z:[A-Z0-9^]*// unless ($ln1 =~ /\tMD:Z:[A-Z0-9^]*/); 99 } 100 } 101 102 my @ln1 = split("\t", $ln1); 103 my @ln2 = split("\t", $ln2); 104 105 # Fix BWA bug: unmapped data should have no alignments 106 if ($ln1[1] & 4) { $ln1[4] = 0; $ln1[5] = "*"; } 107 if ($ln2[1] & 4) { $ln2[4] = 0; $ln2[5] = "*"; } 108 109 # Canonicalise floating point numbers 110 map {s/^(..):f:(.*)/{"$1:f:".($2+0)}/e} @ln1[11..$#ln1]; 111 map {s/^(..):f:(.*)/{"$1:f:".($2+0)}/e} @ln2[11..$#ln2]; 112 113 114 if (exists $opts{Baux}) { 115 # Turn ??:H:<hex> into ??:B:c,<vals> so we can compare 116 # Cramtools.jar vs htslib encodings. Probably doable with (un)pack 117 map {s/^(..):H:(.*)/{join(",", "$1:B:C", map {hex $_} $2=~m:..:g)}/e} @ln1[11..$#ln1]; 118 map {s/^(..):H:(.*)/{join(",", "$1:B:C", map {hex $_} $2=~m:..:g)}/e} @ln2[11..$#ln2]; 119 120 # Canonicalise ??:B:? data series to be unsigned 121 map {s/^(..):B:c(,?)(.*)/{"$1:B:C$2".join(",",map {($_+256)&255} split(",",$3))}/e} @ln1[11..$#ln1]; 122 map {s/^(..):B:c(,?)(.*)/{"$1:B:C$2".join(",",map {($_+256)&255} split(",",$3))}/e} @ln2[11..$#ln2]; 123 124 map {s/^(..):B:s(,?)(.*)/{"$1:B:S$2".join(",",map {($_+65536)&65535} split(",",$3))}/e} @ln1[11..$#ln1]; 125 map {s/^(..):B:s(,?)(.*)/{"$1:B:S$2".join(",",map {($_+65536)&65535} split(",",$3))}/e} @ln2[11..$#ln2]; 126 127 map {s/^(..):B:i(,?)(.*)/{"$1:B:I$2".join(",",map {$_<0? ($_+4294967296) : $_} split(",",$3))}/e} @ln1[11..$#ln1]; 128 map {s/^(..):B:i(,?)(.*)/{"$1:B:I$2".join(",",map {$_<0? ($_+4294967296) : $_} split(",",$3))}/e} @ln2[11..$#ln2]; 129 } 130 131 # Rationalise order of auxiliary fields 132 if (exists $opts{noaux}) { 133 @ln1 = @ln1[0..10]; 134 @ln2 = @ln2[0..10]; 135 } else { 136 #my @a=@ln1[11..$#ln1];print "<<<@a>>>\n"; 137 @ln1[11..$#ln1] = sort @ln1[11..$#ln1]; 138 @ln2[11..$#ln2] = sort @ln2[11..$#ln2]; 139 } 140 141 if (exists $opts{noqual}) { 142 $ln1[10] = "*"; 143 $ln2[10] = "*"; 144 } 145 146 if (exists $opts{notemplate}) { 147 @ln1[6..8] = qw/* 0 0/; 148 @ln2[6..8] = qw/* 0 0/; 149 } 150 151 if (exists $opts{noflag}) { 152 $ln1[1] = 0; $ln2[1] = 0; 153 } 154 155 if (exists $opts{'template-1'}) { 156 if (abs($ln1[8] - $ln2[8]) == 1) { 157 $ln1[8] = $ln2[8]; 158 } 159 } 160 161 # Cram doesn't uppercase the reference 162 $ln1[9] = uc($ln1[9]); 163 $ln2[9] = uc($ln2[9]); 164 165 # Cram will populate a sequence string that starts as "*" 166 $ln2[9] = "*" if ($ln1[9] eq "*"); 167 168 # Fix 0<op> cigar fields 169 $ln1[5] =~ s/(\D|^)0\D/$1/g; 170 $ln1[5] =~ s/^$/*/g; 171 $ln2[5] =~ s/(\D|^)0\D/$1/g; 172 $ln2[5] =~ s/^$/*/g; 173 174 # Fix 10M10M cigar to 20M 175 $ln1[5] =~ s/(\d+)(\D)(\d+)(\2)/$1+$3.$2/e; 176 $ln2[5] =~ s/(\d+)(\D)(\d+)(\2)/$1+$3.$2/e; 177 178 if ("@ln1" ne "@ln2") { 179 print "Diff at lines $fn1:$c1, $fn2:$c2\n"; 180 my @s1 = split("","@ln1"); 181 my @s2 = split("","@ln2"); 182 my $ptr = ""; 183 for (my $i=0; $i < $#s1; $i++) { 184 if ($s1[$i] eq $s2[$i]) { 185 $ptr .= "-"; 186 } else { 187 last; 188 } 189 } 190 print "1\t@ln1\n2\t@ln2\n\t$ptr^\n\n"; 191 exit(1); 192 } 193 194 $ln1 = <$fd1>; 195 $ln2 = <$fd2>; 196 197 $c1++; $c2++; 198} 199 200if (defined($ln1)) { 201 print "EOF on $fn1\n"; 202 exit(1); 203} 204 205if (defined($ln2)) { 206 print "EOF on $fn2\n"; 207 exit(1); 208} 209 210close($fd1); 211close($fd2); 212 213exit(0); 214