1#!/usr/bin/env perl
2#
3#
4# Script to convert GERALD export files to SAM format.
5#
6#
7#
8########## License:
9#
10# The MIT License
11#
12# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd.
13# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc.
14#
15# Permission is hereby granted, free of charge, to any person obtaining a copy
16# of this software and associated documentation files (the "Software"), to deal
17# in the Software without restriction, including without limitation the rights
18# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19# copies of the Software, and to permit persons to whom the Software is
20# furnished to do so, subject to the following conditions:
21#
22# The above copyright notice and this permission notice shall be included in
23# all copies or substantial portions of the Software.
24#
25# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31# THE SOFTWARE.
32#
33#
34#
35########## ChangeLog:
36#
37# Version: 2.0.0 (15FEB2010)
38#   Script updated by Illumina in conjunction with CASAVA 1.7.0 release.
39#   Major changes are as follows:
40#   - The CIGAR string has been updated to include all gaps from ELANDv2 alignments.
41#   - The ELAND single read alignment score is always stored in the optional "SM" field
42#       and the ELAND paired read alignment score is stored in the optional "AS" field
43#       when it exists.
44#   - The MAPQ value is set to the higher of the two alignment scores, but no greater
45#       than 254,  i.e. min(254,max(SM,AS))
46#   - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's
47#       expected orientation and insert size criteria.
48#   - The default quality score translation is set for export files which contain
49#       Phread+64 quality values. An option, "--qlogodds", has been added to
50#       translate quality values from the Solexa+64 format used in export files prior
51#       to Pipeline 1.3
52#   - The export match descriptor is now reverse-complemented when necessary such that
53#       it always corresponds to the forward strand of the reference, to be consistent
54#       with other information in the SAM record. It is now written to the optional
55#       'XD' field (rather than 'MD') to acknowledge its minor differences from the
56#       samtools match descriptor (see additional detail below).
57#   - An option, "--nofilter", has been added to include reads which have failed
58#       primary analysis quality filtration. Such reads will have the corresponding
59#       SAM flag bit (0x0200) set.
60#   - Labels in the export 'contig' field are preserved by setting RNAME to
61#       "$export_chromosome/$export_contig" when then contig label exists.
62#
63#
64# Contact: lh3
65# Version: 0.1.2 (03JAN2009)
66#
67#
68#
69########## Known Conversion Limitations:
70#
71# - Export records for reads that map to a position < 1 (allowed in export format), are converted
72#     to unmapped reads in the SAM record.
73# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the
74#     aligner could not map the read to the reference sequence set, and "QC" means that the
75#     aligner did not attempt to map the read due to some technical limitation. Both of these
76#     alignment types are collapsed to the single unmapped alignment state in the SAM record.
77# - The export match descriptor is slightly different than the samtools match descriptor. For
78#     this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the
79#     export match descriptor differs from the samtools version in two respects: (1) indels
80#     are explicitly closed with the '$' character and (2) insertions must be enumerated in
81#     the match descriptor. For example a 35-base read with a two-base insertion is described
82#     as: 20^2$14
83#
84#
85#
86
87my $version = "2.0.0";
88
89use strict;
90use warnings;
91
92use File::Spec qw(splitpath);
93use Getopt::Long;
94use List::Util qw(min max);
95
96
97use constant {
98  EXPORT_INDEX => 6,
99  EXPORT_READNO => 7,
100  EXPORT_READ => 8,
101  EXPORT_QUAL => 9,
102  EXPORT_CHROM => 10,
103  EXPORT_CONTIG => 11,
104  EXPORT_POS => 12,
105  EXPORT_STRAND => 13,
106  EXPORT_MD => 14,
107  EXPORT_SEMAP => 15,
108  EXPORT_PEMAP => 16,
109  EXPORT_PASSFILT => 21,
110};
111
112
113use constant {
114  SAM_QNAME => 0,
115  SAM_FLAG => 1,
116  SAM_RNAME => 2,
117  SAM_POS => 3,
118  SAM_MAPQ => 4,
119  SAM_CIGAR => 5,
120  SAM_MRNM => 6,
121  SAM_MPOS => 7,
122  SAM_ISIZE => 8,
123  SAM_SEQ => 9,
124  SAM_QUAL => 10,
125};
126
127
128# function prototypes for Richard's code
129sub match_desc_to_cigar($);
130sub match_desc_frag_length($);
131sub reverse_compl_match_descriptor($);
132sub write_header($;$;$);
133
134
135&export2sam;
136exit;
137
138
139
140
141sub export2sam {
142
143  my $cmdline = $0 . " " . join(" ",@ARGV);
144  my $arg_count = scalar @ARGV;
145  my @spval = File::Spec->splitpath($0);
146  my $progname = $spval[2];
147
148  my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
149  my $is_nofilter = 0;
150  my $read1file;
151  my $read2file;
152  my $print_version = 0;
153  my $help = 0;
154
155  my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
156                           "nofilter" => \$is_nofilter,
157                           "read1=s"  => \$read1file,
158                           "read2=s"  => \$read2file,
159                           "version"  => \$print_version,
160                           "help"     => \$help );
161
162  my $usage = <<END;
163
164$progname converts GERALD export files to SAM format.
165
166Usage: $progname --read1=FILENAME [ options ] | --version | --help
167
168  --read1=FILENAME  read1 export file (mandatory)
169  --read2=FILENAME  read2 export file
170  --nofilter        include reads that failed the pipeline/RTA purity filter
171  --qlogodds        assume export file(s) use logodds quality values as reported
172                      by pipeline prior to v1.3 (default: phred quality values)
173
174END
175
176  my $version_msg = <<END;
177
178$progname version: $version
179
180END
181
182  if((not $result) or $help or ($arg_count==0)) {
183    die($usage);
184  }
185
186  if(@ARGV) {
187    print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
188    die($usage);
189  }
190
191  if($print_version) {
192    die($version_msg);
193  }
194
195  if(not defined($read1file)) {
196    print STDERR "\nERROR: read1 export file must be specified\n\n";
197    die($usage);
198  }
199
200  my ($fh1, $fh2, $is_paired);
201
202  open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n");
203  $is_paired = defined $read2file;
204  if ($is_paired) {
205    open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n");
206  }
207  # quality value conversion table
208  my @conv_table;
209  if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
210    for (-64..64) {
211      $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
212    }
213  } else {               # convert from phred+64 quality values (pipeline v1.3+):
214    for (-64..-1) {
215      $conv_table[$_+64] = undef;
216    }
217    for (0..64) {
218      $conv_table[$_+64] = int(33 + $_);
219    }
220  }
221  # write the header
222  print write_header( $progname, $version, $cmdline );
223  # core loop
224  my $export_line_count = 0;
225  while (<$fh1>) {
226    $export_line_count++;
227    my (@s1, @s2);
228    &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
229    if ($is_paired) {
230      my $read2line = <$fh2>;
231      if(not $read2line){
232        die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n  Extra reads observed in read1 file at line no: $export_line_count.\n\n");
233      }
234      &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
235
236      if (@s1 && @s2) { # then set mate coordinate
237        if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
238          die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n  Read1: $_  Read2: $read2line\n");
239        }
240
241        my $isize = 0;
242        if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
243          my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
244          my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
245          $isize = $x2 - $x1;
246        }
247
248        foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
249          my ($sa,$sb,$is) = @{$_};
250          if ($sb->[SAM_RNAME] ne '*') {
251            $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
252            $sa->[SAM_MPOS] = $sb->[SAM_POS];
253            $sa->[SAM_ISIZE] = $is;
254            $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
255          } else {
256            $sa->[SAM_FLAG] |= 0x8;
257          }
258        }
259      }
260    }
261    print join("\t", @s1), "\n" if (@s1);
262    print join("\t", @s2), "\n" if (@s2 && $is_paired);
263  }
264  close($fh1);
265  if($is_paired) {
266    while(my $read2line = <$fh2>){
267      $export_line_count++;
268      die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n  Extra reads observed in read2 file at line no: $export_line_count.\n\n");
269    }
270    close($fh2);
271  }
272}
273
274sub export2sam_aux {
275  my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
276  chomp($line);
277  my @t = split("\t", $line);
278  @$s = ();
279  my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
280  return if(not ($isPassFilt or $is_nofilter));
281  # read name
282  $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
283  # initial flag (will be updated later)
284  $s->[SAM_FLAG] = 0;
285  if($is_paired) {
286    if($t[EXPORT_READNO] != $read_no){
287      die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
288    }
289    $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
290  }
291  $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
292
293  # read & quality
294  my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
295  if ($is_export_rev) { # then reverse the sequence and quality
296    $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
297    $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
298    $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
299  } else {
300    $s->[SAM_SEQ] = $t[EXPORT_READ];
301    $s->[SAM_QUAL] = $t[EXPORT_QUAL];
302  }
303  my @convqual = ();
304  foreach (unpack('C*', $s->[SAM_QUAL])){
305    my $val=$ct->[$_];
306    if(not defined $val){
307      my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
308      if( $_ < 64 ) { $msg .= "  Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
309      die($msg . "\n");
310    }
311    push @convqual,$val;
312  }
313
314  $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
315
316
317  # coor
318  my $has_coor = 0;
319  $s->[SAM_RNAME] = "*";
320  if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') {
321    $s->[SAM_FLAG] |= 0x4; # unmapped
322  } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
323    $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
324    push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
325  } elsif ($t[EXPORT_POS] < 1) {
326    $s->[SAM_FLAG] |= 0x4; # unmapped
327  } else {
328    $s->[SAM_RNAME] = $t[EXPORT_CHROM];
329    $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
330    $has_coor = 1;
331  }
332  $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
333
334#  print STDERR "t[14] = " . $t[14] . "\n";
335  my $matchDesc = '';
336  $s->[SAM_CIGAR] = "*";
337  if($has_coor){
338    $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
339
340    if($matchDesc =~ /\^/){
341      # construct CIGAR string using Richard's function
342      $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
343    } else {
344      $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
345    }
346  }
347
348#  print STDERR "cigar_string = $cigar_string\n";
349
350  $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
351  if($has_coor){
352    my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
353    my $pemap = 0;
354    if($is_paired) {
355      $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
356
357      # set `proper pair' bit if non-blank, non-zero PE alignment score:
358      $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
359    }
360    $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
361  } else {
362    $s->[SAM_MAPQ] = 0;
363  }
364  # mate coordinate
365  $s->[SAM_MRNM] = '*';
366  $s->[SAM_MPOS] = 0;
367  $s->[SAM_ISIZE] = 0;
368  # aux
369  push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
370  if($has_coor){
371    # The export match descriptor differs slightly from the samtools match descriptor.
372    # In order for the converted SAM files to be as compliant as possible,
373    # we put the export match descriptor in optional field 'XD' rather than 'MD':
374    push(@$s, "XD:Z:$matchDesc");
375    push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
376    push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
377  }
378}
379
380
381
382#
383# the following code is taken from Richard Shaw's sorted2sam.pl file
384#
385sub reverse_compl_match_descriptor($)
386{
387#    print "\nREVERSING THE MATCH DESCRIPTOR!\n";
388    my ($match_desc) = @_;
389    my $rev_compl_match_desc = reverse($match_desc);
390    $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
391
392    # Unreverse the digits of numbers.
393    $rev_compl_match_desc = join('',
394                                 map {($_ =~ /\d+/)
395                                      ? join('', reverse(split('', $_)))
396                                      : $_} split(/(\d+)/,
397                                                  $rev_compl_match_desc));
398
399    return $rev_compl_match_desc;
400}
401
402
403
404sub match_desc_to_cigar($)
405{
406    my ($match_desc) = @_;
407
408    my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
409    my $cigar_str = '';
410    my $cigar_del_ch = 'D';
411    my $cigar_ins_ch = 'I';
412    my $cigar_match_ch = 'M';
413
414    foreach my $match_desc_part (@match_desc_parts) {
415        next if (!$match_desc_part);
416
417        if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
418            # Deletion
419            $cigar_str .= (length($1) . $cigar_del_ch);
420        } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
421            # Insertion
422            $cigar_str .= ($1 . $cigar_ins_ch);
423        } else {
424            $cigar_str .= (match_desc_frag_length($match_desc_part)
425                           . $cigar_match_ch);
426        }
427    }
428
429    return $cigar_str;
430}
431
432
433#------------------------------------------------------------------------------
434
435sub match_desc_frag_length($)
436                           {
437    my ($match_desc_str) = @_;
438    my $len = 0;
439
440    my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
441
442    foreach my $match_desc_field (@match_desc_fields) {
443        next if ($match_desc_field eq '');
444
445        $len += (($match_desc_field =~ /(\d+)/)
446                 ? $1 : length($match_desc_field));
447    }
448
449    return $len;
450}
451
452
453# argument holds the command line
454sub write_header($;$;$)
455{
456	my ($progname,$version,$cl) = @_;
457	my $complete_header = "";
458	$complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
459
460	return $complete_header;
461}
462