1#!/usr/bin/env perl 2# 3# 4# Script to convert GERALD export files to SAM format. 5# 6# 7# 8########## License: 9# 10# The MIT License 11# 12# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd. 13# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc. 14# 15# Permission is hereby granted, free of charge, to any person obtaining a copy 16# of this software and associated documentation files (the "Software"), to deal 17# in the Software without restriction, including without limitation the rights 18# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19# copies of the Software, and to permit persons to whom the Software is 20# furnished to do so, subject to the following conditions: 21# 22# The above copyright notice and this permission notice shall be included in 23# all copies or substantial portions of the Software. 24# 25# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 31# THE SOFTWARE. 32# 33# 34# 35########## ChangeLog: 36# 37# Version: 2.0.0 (15FEB2010) 38# Script updated by Illumina in conjunction with CASAVA 1.7.0 release. 39# Major changes are as follows: 40# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments. 41# - The ELAND single read alignment score is always stored in the optional "SM" field 42# and the ELAND paired read alignment score is stored in the optional "AS" field 43# when it exists. 44# - The MAPQ value is set to the higher of the two alignment scores, but no greater 45# than 254, i.e. min(254,max(SM,AS)) 46# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's 47# expected orientation and insert size criteria. 48# - The default quality score translation is set for export files which contain 49# Phread+64 quality values. An option, "--qlogodds", has been added to 50# translate quality values from the Solexa+64 format used in export files prior 51# to Pipeline 1.3 52# - The export match descriptor is now reverse-complemented when necessary such that 53# it always corresponds to the forward strand of the reference, to be consistent 54# with other information in the SAM record. It is now written to the optional 55# 'XD' field (rather than 'MD') to acknowledge its minor differences from the 56# samtools match descriptor (see additional detail below). 57# - An option, "--nofilter", has been added to include reads which have failed 58# primary analysis quality filtration. Such reads will have the corresponding 59# SAM flag bit (0x0200) set. 60# - Labels in the export 'contig' field are preserved by setting RNAME to 61# "$export_chromosome/$export_contig" when then contig label exists. 62# 63# 64# Contact: lh3 65# Version: 0.1.2 (03JAN2009) 66# 67# 68# 69########## Known Conversion Limitations: 70# 71# - Export records for reads that map to a position < 1 (allowed in export format), are converted 72# to unmapped reads in the SAM record. 73# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the 74# aligner could not map the read to the reference sequence set, and "QC" means that the 75# aligner did not attempt to map the read due to some technical limitation. Both of these 76# alignment types are collapsed to the single unmapped alignment state in the SAM record. 77# - The export match descriptor is slightly different than the samtools match descriptor. For 78# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the 79# export match descriptor differs from the samtools version in two respects: (1) indels 80# are explicitly closed with the '$' character and (2) insertions must be enumerated in 81# the match descriptor. For example a 35-base read with a two-base insertion is described 82# as: 20^2$14 83# 84# 85# 86 87my $version = "2.0.0"; 88 89use strict; 90use warnings; 91 92use File::Spec qw(splitpath); 93use Getopt::Long; 94use List::Util qw(min max); 95 96 97use constant { 98 EXPORT_INDEX => 6, 99 EXPORT_READNO => 7, 100 EXPORT_READ => 8, 101 EXPORT_QUAL => 9, 102 EXPORT_CHROM => 10, 103 EXPORT_CONTIG => 11, 104 EXPORT_POS => 12, 105 EXPORT_STRAND => 13, 106 EXPORT_MD => 14, 107 EXPORT_SEMAP => 15, 108 EXPORT_PEMAP => 16, 109 EXPORT_PASSFILT => 21, 110}; 111 112 113use constant { 114 SAM_QNAME => 0, 115 SAM_FLAG => 1, 116 SAM_RNAME => 2, 117 SAM_POS => 3, 118 SAM_MAPQ => 4, 119 SAM_CIGAR => 5, 120 SAM_MRNM => 6, 121 SAM_MPOS => 7, 122 SAM_ISIZE => 8, 123 SAM_SEQ => 9, 124 SAM_QUAL => 10, 125}; 126 127 128# function prototypes for Richard's code 129sub match_desc_to_cigar($); 130sub match_desc_frag_length($); 131sub reverse_compl_match_descriptor($); 132sub write_header($;$;$); 133 134 135&export2sam; 136exit; 137 138 139 140 141sub export2sam { 142 143 my $cmdline = $0 . " " . join(" ",@ARGV); 144 my $arg_count = scalar @ARGV; 145 my @spval = File::Spec->splitpath($0); 146 my $progname = $spval[2]; 147 148 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values 149 my $is_nofilter = 0; 150 my $read1file; 151 my $read2file; 152 my $print_version = 0; 153 my $help = 0; 154 155 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, 156 "nofilter" => \$is_nofilter, 157 "read1=s" => \$read1file, 158 "read2=s" => \$read2file, 159 "version" => \$print_version, 160 "help" => \$help ); 161 162 my $usage = <<END; 163 164$progname converts GERALD export files to SAM format. 165 166Usage: $progname --read1=FILENAME [ options ] | --version | --help 167 168 --read1=FILENAME read1 export file (mandatory) 169 --read2=FILENAME read2 export file 170 --nofilter include reads that failed the pipeline/RTA purity filter 171 --qlogodds assume export file(s) use logodds quality values as reported 172 by pipeline prior to v1.3 (default: phred quality values) 173 174END 175 176 my $version_msg = <<END; 177 178$progname version: $version 179 180END 181 182 if((not $result) or $help or ($arg_count==0)) { 183 die($usage); 184 } 185 186 if(@ARGV) { 187 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n"; 188 die($usage); 189 } 190 191 if($print_version) { 192 die($version_msg); 193 } 194 195 if(not defined($read1file)) { 196 print STDERR "\nERROR: read1 export file must be specified\n\n"; 197 die($usage); 198 } 199 200 my ($fh1, $fh2, $is_paired); 201 202 open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n"); 203 $is_paired = defined $read2file; 204 if ($is_paired) { 205 open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n"); 206 } 207 # quality value conversion table 208 my @conv_table; 209 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): 210 for (-64..64) { 211 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); 212 } 213 } else { # convert from phred+64 quality values (pipeline v1.3+): 214 for (-64..-1) { 215 $conv_table[$_+64] = undef; 216 } 217 for (0..64) { 218 $conv_table[$_+64] = int(33 + $_); 219 } 220 } 221 # write the header 222 print write_header( $progname, $version, $cmdline ); 223 # core loop 224 my $export_line_count = 0; 225 while (<$fh1>) { 226 $export_line_count++; 227 my (@s1, @s2); 228 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); 229 if ($is_paired) { 230 my $read2line = <$fh2>; 231 if(not $read2line){ 232 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); 233 } 234 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); 235 236 if (@s1 && @s2) { # then set mate coordinate 237 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ 238 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); 239 } 240 241 my $isize = 0; 242 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize 243 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; 244 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; 245 $isize = $x2 - $x1; 246 } 247 248 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ 249 my ($sa,$sb,$is) = @{$_}; 250 if ($sb->[SAM_RNAME] ne '*') { 251 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; 252 $sa->[SAM_MPOS] = $sb->[SAM_POS]; 253 $sa->[SAM_ISIZE] = $is; 254 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); 255 } else { 256 $sa->[SAM_FLAG] |= 0x8; 257 } 258 } 259 } 260 } 261 print join("\t", @s1), "\n" if (@s1); 262 print join("\t", @s2), "\n" if (@s2 && $is_paired); 263 } 264 close($fh1); 265 if($is_paired) { 266 while(my $read2line = <$fh2>){ 267 $export_line_count++; 268 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); 269 } 270 close($fh2); 271 } 272} 273 274sub export2sam_aux { 275 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; 276 chomp($line); 277 my @t = split("\t", $line); 278 @$s = (); 279 my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); 280 return if(not ($isPassFilt or $is_nofilter)); 281 # read name 282 $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; 283 # initial flag (will be updated later) 284 $s->[SAM_FLAG] = 0; 285 if($is_paired) { 286 if($t[EXPORT_READNO] != $read_no){ 287 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); 288 } 289 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); 290 } 291 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); 292 293 # read & quality 294 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); 295 if ($is_export_rev) { # then reverse the sequence and quality 296 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); 297 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; 298 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); 299 } else { 300 $s->[SAM_SEQ] = $t[EXPORT_READ]; 301 $s->[SAM_QUAL] = $t[EXPORT_QUAL]; 302 } 303 my @convqual = (); 304 foreach (unpack('C*', $s->[SAM_QUAL])){ 305 my $val=$ct->[$_]; 306 if(not defined $val){ 307 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; 308 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } 309 die($msg . "\n"); 310 } 311 push @convqual,$val; 312 } 313 314 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding 315 316 317 # coor 318 my $has_coor = 0; 319 $s->[SAM_RNAME] = "*"; 320 if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') { 321 $s->[SAM_FLAG] |= 0x4; # unmapped 322 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { 323 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? 324 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") 325 } elsif ($t[EXPORT_POS] < 1) { 326 $s->[SAM_FLAG] |= 0x4; # unmapped 327 } else { 328 $s->[SAM_RNAME] = $t[EXPORT_CHROM]; 329 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); 330 $has_coor = 1; 331 } 332 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; 333 334# print STDERR "t[14] = " . $t[14] . "\n"; 335 my $matchDesc = ''; 336 $s->[SAM_CIGAR] = "*"; 337 if($has_coor){ 338 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; 339 340 if($matchDesc =~ /\^/){ 341 # construct CIGAR string using Richard's function 342 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing 343 } else { 344 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; 345 } 346 } 347 348# print STDERR "cigar_string = $cigar_string\n"; 349 350 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); 351 if($has_coor){ 352 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; 353 my $pemap = 0; 354 if($is_paired) { 355 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; 356 357 # set `proper pair' bit if non-blank, non-zero PE alignment score: 358 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); 359 } 360 $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); 361 } else { 362 $s->[SAM_MAPQ] = 0; 363 } 364 # mate coordinate 365 $s->[SAM_MRNM] = '*'; 366 $s->[SAM_MPOS] = 0; 367 $s->[SAM_ISIZE] = 0; 368 # aux 369 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); 370 if($has_coor){ 371 # The export match descriptor differs slightly from the samtools match descriptor. 372 # In order for the converted SAM files to be as compliant as possible, 373 # we put the export match descriptor in optional field 'XD' rather than 'MD': 374 push(@$s, "XD:Z:$matchDesc"); 375 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); 376 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); 377 } 378} 379 380 381 382# 383# the following code is taken from Richard Shaw's sorted2sam.pl file 384# 385sub reverse_compl_match_descriptor($) 386{ 387# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; 388 my ($match_desc) = @_; 389 my $rev_compl_match_desc = reverse($match_desc); 390 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; 391 392 # Unreverse the digits of numbers. 393 $rev_compl_match_desc = join('', 394 map {($_ =~ /\d+/) 395 ? join('', reverse(split('', $_))) 396 : $_} split(/(\d+)/, 397 $rev_compl_match_desc)); 398 399 return $rev_compl_match_desc; 400} 401 402 403 404sub match_desc_to_cigar($) 405{ 406 my ($match_desc) = @_; 407 408 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); 409 my $cigar_str = ''; 410 my $cigar_del_ch = 'D'; 411 my $cigar_ins_ch = 'I'; 412 my $cigar_match_ch = 'M'; 413 414 foreach my $match_desc_part (@match_desc_parts) { 415 next if (!$match_desc_part); 416 417 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { 418 # Deletion 419 $cigar_str .= (length($1) . $cigar_del_ch); 420 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { 421 # Insertion 422 $cigar_str .= ($1 . $cigar_ins_ch); 423 } else { 424 $cigar_str .= (match_desc_frag_length($match_desc_part) 425 . $cigar_match_ch); 426 } 427 } 428 429 return $cigar_str; 430} 431 432 433#------------------------------------------------------------------------------ 434 435sub match_desc_frag_length($) 436 { 437 my ($match_desc_str) = @_; 438 my $len = 0; 439 440 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); 441 442 foreach my $match_desc_field (@match_desc_fields) { 443 next if ($match_desc_field eq ''); 444 445 $len += (($match_desc_field =~ /(\d+)/) 446 ? $1 : length($match_desc_field)); 447 } 448 449 return $len; 450} 451 452 453# argument holds the command line 454sub write_header($;$;$) 455{ 456 my ($progname,$version,$cl) = @_; 457 my $complete_header = ""; 458 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; 459 460 return $complete_header; 461} 462